From 19d6b49b64e141639f649237570ce616a0122f9f Mon Sep 17 00:00:00 2001
From: Jack Culpepper <jackcul@yahoo-inc.com>
Date: Mon, 13 Oct 2014 23:45:16 +0000
Subject: [PATCH 01/10] tests pass on today's dev

---
 include/caffe/filler.hpp                      |  25 ++
 include/caffe/util/local_update.hpp           |  26 ++
 include/caffe/vision_layers.hpp               |  50 ++++
 src/caffe/layer_factory.cpp                   |   1 +
 .../layers/local_weighted_conv_layer.cpp      | 229 ++++++++++++++++++
 src/caffe/layers/local_weighted_conv_layer.cu | 200 +++++++++++++++
 src/caffe/proto/caffe.proto                   |  15 +-
 .../test/test_local_weighted_conv_layer.cpp   | 174 +++++++++++++
 src/caffe/util/local_update.cpp               |  56 +++++
 src/caffe/util/local_update.cu                |  85 +++++++
 10 files changed, 860 insertions(+), 1 deletion(-)
 create mode 100644 include/caffe/util/local_update.hpp
 create mode 100644 src/caffe/layers/local_weighted_conv_layer.cpp
 create mode 100644 src/caffe/layers/local_weighted_conv_layer.cu
 create mode 100644 src/caffe/test/test_local_weighted_conv_layer.cpp
 create mode 100644 src/caffe/util/local_update.cpp
 create mode 100644 src/caffe/util/local_update.cu
diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
index 136ce958aed..46fb60992f4 100644
--- a/include/caffe/filler.hpp
+++ b/include/caffe/filler.hpp
@@ -158,6 +158,29 @@ class XavierFiller : public Filler<Dtype> {
 };
 
 
+/**
+ * @brief Test-only filler: every element of a single-channel blob is set to
+ *        its own width index, so the values depend only on the last axis.
+ */
+template <typename Dtype>
+class TestLocalWeightConvolutionFiller : public Filler<Dtype> {
+ public:
+  explicit TestLocalWeightConvolutionFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    CHECK_EQ(blob->channels(), 1);
+    Dtype* data = blob->mutable_cpu_data();
+    for (int n = 0; n < blob->num(); ++n) {
+      for (int j = 0; j < blob->height(); ++j) {
+        for (int i = 0; i < blob->width(); ++i) {
+          data[blob->offset(n, 0, j, i)] = i;
+        }
+      }
+    }
+  }
+};
+
+
 /**
  * @brief Get a specific filler from the specification given in FillerParameter.
  *
@@ -177,6 +200,8 @@ Filler<Dtype>* GetFiller(const FillerParameter& param) {
     return new UniformFiller<Dtype>(param);
   } else if (type == "xavier") {
     return new XavierFiller<Dtype>(param);
+  } else if (type == "test_local_weight_convolution") {
+    return new TestLocalWeightConvolutionFiller<Dtype>(param);
   } else {
     CHECK(false) << "Unknown filler name: " << param.type();
   }
diff --git a/include/caffe/util/local_update.hpp b/include/caffe/util/local_update.hpp
new file mode 100644
index 00000000000..ce9d1fd3b09
--- /dev/null
+++ b/include/caffe/util/local_update.hpp
@@ -0,0 +1,26 @@
+// Copyright 2014 BVLC and contributors.
+
+#ifndef _CAFFE_UTIL_LOCAL_UPDATE_HPP_
+#define _CAFFE_UTIL_LOCAL_UPDATE_HPP_
+
+namespace caffe {
+
+template <typename Dtype>
+void local_update1_cpu(const Dtype* data_A, const Dtype* data_B, Dtype* data_R, const int filter_num,
+	    const int location_num, const int output_num);
+
+template <typename Dtype>
+void local_update1_gpu(const Dtype* data_A, const Dtype* data_B, Dtype* data_R, const int filter_num,
+	    const int location_num, const int output_num);
+
+template <typename Dtype>
+void local_update2_cpu(const Dtype* data_A, const Dtype* data_B, Dtype* data_R, const int filter_num,
+	    const int location_num, const int output_num);
+
+template <typename Dtype>
+void local_update2_gpu(const Dtype* data_A, const Dtype* data_B, Dtype* data_R, const int filter_num,
+	    const int location_num, const int output_num);
+
+}  // namespace caffe
+
+#endif  // _CAFFE_UTIL_LOCAL_UPDATE_HPP_
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index c803cd72449..e8fbcaf694c 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -112,6 +112,56 @@ class ConvolutionLayer : public Layer<Dtype> {
   Blob<Dtype> bias_multiplier_;
 };
 
+// Convolution-like layer whose weights are not shared across output
+// locations: LayerSetUp creates a (num_output, 1, K_, N_) weight blob.
+template <typename Dtype>
+class LocalWeightedConvolutionLayer : public Layer<Dtype> {
+ public:
+  explicit LocalWeightedConvolutionLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline LayerParameter_LayerType type() const {
+    return LayerParameter_LayerType_LOCAL_WEIGHTED_CONVOLUTION;
+  }
+  virtual inline int MinBottomBlobs() const { return 1; }
+  virtual inline int MinTopBlobs() const { return 1; }
+  virtual inline bool EqualNumBottomTopBlobs() const { return true; }
+  // NOTE(review): LayerSetUp CHECKs for exactly one bottom and one top blob.
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  // Hyper-parameters and bottom/top geometry, all assigned in LayerSetUp.
+  int kernel_size_;
+  int stride_;
+  int num_;
+  int channels_;
+  int pad_;
+  int height_, width_;
+  int height_out_, width_out_;
+  int num_output_;
+  bool bias_term_;
+
+  int M_;  // num_output_
+  int K_;  // channels_ * kernel_size_ * kernel_size_
+  int N_;  // height_out_ * width_out_
+
+  Blob<Dtype> col_buffer_;  // im2col output for a single image
+};
+
+
+
+
 #ifdef USE_CUDNN
 /*
  * @brief cuDNN implementation of ConvolutionLayer.
diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
index 69863543c30..a638008ce01 100644
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@@ -185,5 +185,6 @@ REGISTER_LAYER_CLASS(SOFTMAX_LOSS, SoftmaxWithLossLayer);
 REGISTER_LAYER_CLASS(SPLIT, SplitLayer);
 REGISTER_LAYER_CLASS(THRESHOLD, ThresholdLayer);
 REGISTER_LAYER_CLASS(WINDOW_DATA, WindowDataLayer);
+REGISTER_LAYER_CLASS(LOCAL_WEIGHTED_CONVOLUTION, LocalWeightedConvolutionLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/local_weighted_conv_layer.cpp b/src/caffe/layers/local_weighted_conv_layer.cpp
new file mode 100644
index 00000000000..24712da1570
--- /dev/null
+++ b/src/caffe/layers/local_weighted_conv_layer.cpp
@@ -0,0 +1,229 @@
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/im2col.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+// Reads the layer parameters, derives the output geometry from bottom[0],
+// shapes top[0] and the im2col buffer, and creates the weight/bias blobs.
+template <typename Dtype>
+void LocalWeightedConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(bottom.size(), 1) << "Conv Layer takes a single blob as input.";
+  CHECK_EQ(top.size(), 1) << "Conv Layer takes a single blob as output.";
+
+  kernel_size_ = this->layer_param_.local_weighted_convolution_param().kernel_size();
+  stride_ = this->layer_param_.local_weighted_convolution_param().stride();
+  pad_ = this->layer_param_.local_weighted_convolution_param().pad();
+  num_ = bottom[0]->num();
+  channels_ = bottom[0]->channels();
+  height_ = bottom[0]->height();
+  width_ = bottom[0]->width();
+  num_output_ = this->layer_param_.local_weighted_convolution_param().num_output();
+  CHECK_GT(num_output_, 0);
+  // kernel_size has no proto default and stride is a divisor below, so a
+  // zero in either would give empty weights or a division by zero.
+  CHECK_GT(kernel_size_, 0) << "kernel_size must be set and positive";
+  CHECK_GT(stride_, 0) << "stride must be positive";
+  CHECK_GE(height_, kernel_size_) << "height smaller than kernel size";
+  CHECK_GE(width_, kernel_size_) << "width smaller than kernel size";
+  // The im2col result buffer would only hold one image at a time to avoid
+  // overly large memory usage.
+  height_out_ = (height_ + 2 * pad_ - kernel_size_) / stride_ + 1;
+  width_out_ = (width_ + 2 * pad_ - kernel_size_) / stride_ + 1;
+  col_buffer_.Reshape(1, channels_ * kernel_size_ * kernel_size_, height_out_, width_out_);
+  // Set the parameters
+  bias_term_ = this->layer_param_.local_weighted_convolution_param().bias_term();
+  // Figure out the dimensions for individual gemms.
+  M_ = num_output_;
+  K_ = channels_ * kernel_size_ * kernel_size_;
+  N_ = height_out_ * width_out_;
+  top[0]->Reshape(bottom[0]->num(), num_output_, height_out_, width_out_);
+  // Check if we need to set up the weights
+  if (this->blobs_.size() > 0) {
+    LOG(INFO) << "Skipping parameter initialization";
+  } else {
+    this->blobs_.resize(bias_term_ ? 2 : 1);
+    // Initialize the weight: a K_ x N_ matrix per output channel.
+    this->blobs_[0].reset(new Blob<Dtype>(num_output_, 1, K_, N_));
+    // fill the weights
+    shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
+        this->layer_param_.local_weighted_convolution_param().weight_filler()));
+    weight_filler->Fill(this->blobs_[0].get());
+    // If necessary, initialize and fill the bias term
+    if (bias_term_) {
+      this->blobs_[1].reset(new Blob<Dtype>(1, 1, M_, N_));
+      shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
+          this->layer_param_.local_weighted_convolution_param().bias_filler()));
+      bias_filler->Fill(this->blobs_[1].get());
+    }
+  }
+}
+
+/**
+ * @brief Re-validates the bottom shape and resizes the top for the current
+ *        batch size.
+ *
+ * The weight and bias blobs created in LayerSetUp are sized by the output
+ * geometry (one filter per output location), so the channel count and the
+ * spatial size of the input must stay fixed; only the batch size may change
+ * between calls.
+ *
+ * @param bottom input blob vector; only bottom[0] is used
+ * @param top output blob vector; top[0] is reshaped to
+ *        (num, num_output, height_out, width_out)
+ */
+template <typename Dtype>
+void LocalWeightedConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(bottom[0]->channels(), channels_)
+      << "Input size incompatible with convolution kernel.";
+  CHECK_EQ(bottom[0]->height(), height_)
+      << "Input height must match the height seen in LayerSetUp.";
+  CHECK_EQ(bottom[0]->width(), width_)
+      << "Input width must match the width seen in LayerSetUp.";
+
+  // Forward/Backward iterate over num_ images, so keep it in sync with the
+  // bottom blob instead of the value cached at setup time.
+  num_ = bottom[0]->num();
+
+  // Output geometry follows from the (unchanged) input geometry.
+  height_out_ = (height_ + 2 * pad_ - kernel_size_) / stride_ + 1;
+  width_out_ = (width_ + 2 * pad_ - kernel_size_) / stride_ + 1;
+
+  // Dimensions of the per-image problem: M_ outputs, K_ patch elements,
+  // N_ output locations.
+  M_ = num_output_;
+  K_ = channels_ * kernel_size_ * kernel_size_;
+  N_ = height_out_ * width_out_;
+  CHECK_EQ(this->blobs_[0]->height(), K_)
+      << "Weight blob does not match the patch size.";
+  CHECK_EQ(this->blobs_[0]->width(), N_)
+      << "Weight blob does not match the number of output locations.";
+
+  // The im2col result buffer holds one image at a time to avoid overly
+  // large memory usage.
+  col_buffer_.Reshape(1, K_, height_out_, width_out_);
+  top[0]->Reshape(num_, num_output_, height_out_, width_out_);
+}
+
+// CPU forward pass. For each image the im2col patches are multiplied
+// element-wise by the per-location weights of each output channel and the
+// K_ rows of the product are summed into one row of top; the bias blob, if
+// present, is then added to the whole image.
+template <typename Dtype>
+void LocalWeightedConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  Dtype* col_data = col_buffer_.mutable_cpu_data();
+  const Dtype* weights = this->blobs_[0]->cpu_data();
+  const Dtype* in_data = bottom[0]->cpu_data();
+  Dtype* out_data = top[0]->mutable_cpu_data();
+
+  // 1 x K_ row of ones: left-multiplying by it sums the K_ rows of a matrix.
+  FillerParameter ones_param;
+  ones_param.set_value(1);
+  ConstantFiller<Dtype> ones_filler(ones_param);
+  Blob<Dtype> ones(1, 1, 1, K_);
+  ones_filler.Fill(&ones);
+
+  // Scratch space for the K_ x N_ element-wise product.
+  Blob<Dtype> product(1, 1, K_, N_);
+  for (int img = 0; img < num_; ++img) {
+    Dtype* out_image = out_data + top[0]->offset(img);
+    im2col_cpu(in_data + bottom[0]->offset(img), channels_, height_, width_,
+               kernel_size_, kernel_size_, pad_, pad_, stride_, stride_,
+               col_data);
+    for (int out = 0; out < num_output_; ++out) {
+      const Dtype* filter = weights + this->blobs_[0]->offset(out);
+      caffe_mul(K_ * N_, col_data, filter, product.mutable_cpu_data());
+      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, 1, N_, K_,
+                            (Dtype)1., ones.cpu_data(), product.cpu_data(),
+                            (Dtype)0., out_data + top[0]->offset(img, out));
+    }
+
+    if (bias_term_) {
+      caffe_add(M_ * N_, this->blobs_[1]->cpu_data(), out_image, out_image);
+    }
+  }
+}
+
+// CPU backward pass: accumulates the bias and weight gradients over the
+// batch and, when propagate_down[0] is set, writes the bottom gradient.
+template <typename Dtype>
+void LocalWeightedConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->cpu_diff();
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+  Dtype* x_data = col_buffer_.mutable_cpu_data();
+  Dtype* x_diff = col_buffer_.mutable_cpu_diff();
+  const Dtype* weight = this->blobs_[0]->cpu_data();
+  Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
+  Dtype* bias_diff = NULL;
+  // Scratch row (N_ values) for one top_diff * weight product.
+  Blob<Dtype> intermediate;
+  intermediate.Reshape(1, 1, 1, N_);
+  // Scratch K_ x N_ matrix for one output channel's weight gradient.
+  Blob<Dtype> xt;
+  xt.Reshape(1, 1, K_, N_);
+  Dtype* xt_data = xt.mutable_cpu_data();
+  // Bias gradient: top_diff summed over the images of the batch.
+  if (bias_term_) {
+    bias_diff = this->blobs_[1]->mutable_cpu_diff();
+    memset(bias_diff, 0, sizeof(Dtype) * this->blobs_[1]->count());
+    for (int n = 0; n < num_; ++n) {
+      caffe_add(M_ * N_, bias_diff,
+                top_diff + top[0]->offset(n),
+                bias_diff);
+    }
+  }
+  // weight_diff is cleared once per call, then accumulated over the batch.
+  memset(weight_diff, 0, sizeof(Dtype) * this->blobs_[0]->count());
+  for (int n=0; n<num_; n++) {
+    im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
+               width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, x_data);
+
+    // gradient wrt weight: weight_diff[m][k][:] += top_diff[n][m][:] * x[k][:]
+    for (int m=0; m<num_output_; m++) {
+      Dtype* filter_weight_diff = weight_diff+this->blobs_[0]->offset(m);
+      for (int k=0; k<K_; k++) {
+        caffe_mul(N_, top_diff+top[0]->offset(n, m),
+                  x_data+col_buffer_.offset(0,k), xt_data+xt.offset(0,0,k));
+      }
+      caffe_cpu_axpby(K_*N_, Dtype(1.0), xt_data, Dtype(1.0), filter_weight_diff);
+    }
+
+    // gradient wrt bottom data: x_diff[k][:] = sum_m top_diff[n][m][:] * weight[m][k][:]
+    if (propagate_down[0]) {
+      memset(x_diff, 0, col_buffer_.count() * sizeof(Dtype));
+      for (int m=0; m<num_output_; m++) {
+        for (int k=0; k<K_; k++) {
+          caffe_mul(N_, top_diff+top[0]->offset(n, m),
+                    weight+this->blobs_[0]->offset(m,0,k),
+                    intermediate.mutable_cpu_data());
+
+          caffe_cpu_axpby(N_, Dtype(1.0),
+                          intermediate.cpu_data(), Dtype(1.0),
+                          x_diff+col_buffer_.offset(0,k));
+        }
+      }
+
+      // col2im back to the data
+      col2im_cpu(x_diff, channels_, height_, width_, kernel_size_, kernel_size_,
+                 pad_, pad_, stride_, stride_, bottom_diff + bottom[0]->offset(n));
+
+    }
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(LocalWeightedConvolutionLayer);
+#endif
+
+INSTANTIATE_CLASS(LocalWeightedConvolutionLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/local_weighted_conv_layer.cu b/src/caffe/layers/local_weighted_conv_layer.cu
new file mode 100644
index 00000000000..0e8bcf032bc
--- /dev/null
+++ b/src/caffe/layers/local_weighted_conv_layer.cu
@@ -0,0 +1,200 @@
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/im2col.hpp"
+#include "caffe/util/local_update.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+/// @brief GPU forward pass; same computation as Forward_cpu using the GPU
+///        im2col and math routines.
+template <typename Dtype>
+void LocalWeightedConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  Dtype* col_data = col_buffer_.mutable_gpu_data();
+  const Dtype* weights = this->blobs_[0]->gpu_data();
+  const Dtype* in_data = bottom[0]->gpu_data();
+  Dtype* out_data = top[0]->mutable_gpu_data();
+
+  // 1 x K_ row of ones: left-multiplying by it sums the K_ rows of a matrix.
+  FillerParameter ones_param;
+  ones_param.set_value(1);
+  ConstantFiller<Dtype> ones_filler(ones_param);
+  Blob<Dtype> ones(1, 1, 1, K_);
+  ones_filler.Fill(&ones);
+
+  // Scratch space for the K_ x N_ element-wise product.
+  Blob<Dtype> product(1, 1, K_, N_);
+  for (int img = 0; img < num_; ++img) {
+    Dtype* out_image = out_data + top[0]->offset(img);
+    im2col_gpu(in_data + bottom[0]->offset(img), channels_, height_, width_,
+               kernel_size_, kernel_size_, pad_, pad_, stride_, stride_,
+               col_data);
+
+    // One output channel at a time: weight the patches, then sum over K_.
+    for (int out = 0; out < num_output_; ++out) {
+      const Dtype* filter = weights + this->blobs_[0]->offset(out);
+      caffe_gpu_mul(K_ * N_, col_data, filter, product.mutable_gpu_data());
+      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, 1, N_, K_,
+                            (Dtype)1., ones.gpu_data(), product.gpu_data(),
+                            (Dtype)0., out_data + top[0]->offset(img, out));
+    }
+
+    // The bias blob holds one value per (output channel, location).
+    if (bias_term_) {
+      caffe_gpu_add(M_ * N_, this->blobs_[1]->gpu_data(), out_image, out_image);
+    }
+  }
+}
+
+/// @brief GPU backward pass.
+///
+/// Accumulates the bias gradient (top_diff summed over the batch) and the
+/// weight gradient (local_update1_gpu, once per image). When
+/// propagate_down[0] is set, the gradient with respect to the im2col buffer
+/// is built with local_update2_gpu and mapped back to the bottom blob with
+/// col2im_gpu.
+template <typename Dtype>
+void LocalWeightedConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->gpu_diff();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  Dtype* x_data = col_buffer_.mutable_gpu_data();
+  Dtype* x_diff = col_buffer_.mutable_gpu_diff();
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+  Dtype* bias_diff = NULL;
+
+  // The local_update*_gpu calls below take weight_diff / x_diff as their
+  // output buffers, so no scratch blobs are allocated here (the temporaries
+  // that were previously created on every call were never read).
+
+  // Bias gradient: one value per (output channel, location), summed over
+  // the images of the batch.
+  if (bias_term_) {
+    bias_diff = this->blobs_[1]->mutable_gpu_diff();
+    CUDA_CHECK(cudaMemset(bias_diff, 0, sizeof(Dtype) * this->blobs_[1]->count()));
+    for (int n = 0; n < num_; ++n) {
+      caffe_gpu_add(M_ * N_, bias_diff,
+                    top_diff + top[0]->offset(n),
+                    bias_diff);
+    }
+  }
+
+  // weight_diff is cleared once per call, before the loop over the images of
+  // the batch.
+  CUDA_CHECK(cudaMemset(weight_diff, 0, sizeof(Dtype) * this->blobs_[0]->count()));
+  for (int n=0; n<num_; n++) {
+    im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
+               width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, x_data);
+
+    // gradient wrt weight
+    local_update1_gpu(top_diff+top[0]->offset(n), x_data, weight_diff, K_, N_, M_);
+
+    // gradient wrt bottom data
+    if (propagate_down[0]) {
+      CUDA_CHECK(cudaMemset(x_diff, 0, col_buffer_.count() * sizeof(Dtype)));
+      local_update2_gpu(top_diff+top[0]->offset(n), weight, x_diff, K_, N_, M_);
+
+      // col2im back to the data
+      col2im_gpu(x_diff, channels_, height_, width_, kernel_size_, kernel_size_,
+                 pad_, pad_, stride_, stride_, bottom_diff + bottom[0]->offset(n));
+    }
+  }
+
+  // NOTE(review): the disabled block below refers to members this layer
+  // does not declare (group_, is_1x1_, bias_multiplier_, kernel_h_, ...);
+  // it cannot be re-enabled as written.
+
+/*
+  const Dtype* weight = NULL;
+  Dtype* weight_diff = NULL;
+  if (this->param_propagate_down_[0]) {
+    weight = this->blobs_[0]->gpu_data();
+    weight_diff = this->blobs_[0]->mutable_gpu_diff();
+    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
+  }
+  Dtype* bias_diff = NULL;
+  if (bias_term_ && this->param_propagate_down_[1]) {
+    bias_diff = this->blobs_[1]->mutable_gpu_diff();
+    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
+  }
+  const int weight_offset = M_ * K_;
+  const int col_offset = K_ * N_;
+  const int top_offset = M_ * N_;
+  for (int i = 0; i < top.size(); ++i) {
+    const Dtype* top_diff = NULL;
+    // Bias gradient, if necessary.
+    if (bias_term_ && this->param_propagate_down_[1]) {
+      top_diff = top[i]->gpu_diff();
+      for (int n = 0; n < num_; ++n) {
+        caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, N_,
+            1., top_diff + top[0]->offset(n),
+            bias_multiplier_.gpu_data(), 1.,
+            bias_diff);
+      }
+    }
+    if (this->param_propagate_down_[0] || propagate_down[i]) {
+      if (!top_diff) {
+        top_diff = top[i]->gpu_diff();
+      }
+      Dtype* col_buff = NULL;
+      if (!is_1x1_) {
+        col_buff = col_buffer_.mutable_gpu_data();
+      }
+      const Dtype* bottom_data = bottom[i]->gpu_data();
+      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+      for (int n = 0; n < num_; ++n) {
+        // Since we saved memory in the forward pass by not storing all col
+        // data, we will need to recompute them.
+        if (!is_1x1_) {
+          im2col_gpu(bottom_data + bottom[i]->offset(n), channels_, height_,
+                    width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+                    stride_h_, stride_w_, col_buff);
+        } else {
+          col_buff = bottom[i]->mutable_gpu_data() + bottom[i]->offset(n);
+        }
+        // gradient w.r.t. weight. Note that we will accumulate diffs.
+        if (this->param_propagate_down_[0]) {
+          for (int g = 0; g < group_; ++g) {
+            caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
+                (Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
+                col_buff + col_offset * g, (Dtype)1.,
+                weight_diff + weight_offset * g);
+          }
+        }
+        // gradient w.r.t. bottom data, if necessary
+        if (propagate_down[i]) {
+          if (weight == NULL) {
+            weight = this->blobs_[0]->gpu_data();
+          }
+          if (is_1x1_) {
+            col_buff = bottom[i]->mutable_gpu_diff() + bottom[i]->offset(n);
+          }
+          for (int g = 0; g < group_; ++g) {
+            caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
+                (Dtype)1., weight + weight_offset * g,
+                top_diff + top[i]->offset(n) + top_offset * g,
+                (Dtype)0., col_buff + col_offset * g);
+          }
+          // col2im back to the data
+          if (!is_1x1_) {
+            col2im_gpu(col_buff, channels_, height_, width_,
+                kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
+                bottom_diff + bottom[i]->offset(n));
+          }
+        }
+      }
+    }
+  }
+*/
+}
+
+
+INSTANTIATE_LAYER_GPU_FUNCS(LocalWeightedConvolutionLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index f0404a09b90..23472365e33 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -221,7 +221,7 @@ message LayerParameter {
   // line above the enum. Update the next available ID when you add a new
   // LayerType.
   //
-  // LayerType next available ID: 39 (last added: EXP)
+  // LayerType next available ID: 40 (last added: LOCAL_WEIGHTED_CONVOLUTION)
   enum LayerType {
     // "NONE" layer type is 0th enum element so that we don't cause confusion
     // by defaulting to an existent LayerType (instead, should usually error if
@@ -248,6 +248,7 @@ message LayerParameter {
     IMAGE_DATA = 12;
     INFOGAIN_LOSS = 13;
     INNER_PRODUCT = 14;
+    LOCAL_WEIGHTED_CONVOLUTION = 39;
     LRN = 15;
     MEMORY_DATA = 29;
     MULTINOMIAL_LOGISTIC_LOSS = 16;
@@ -309,6 +310,7 @@ message LayerParameter {
   optional ImageDataParameter image_data_param = 15;
   optional InfogainLossParameter infogain_loss_param = 16;
   optional InnerProductParameter inner_product_param = 17;
+  optional LocalWeightedConvolutionParameter local_weighted_convolution_param = 102;
   optional LRNParameter lrn_param = 18;
   optional MemoryDataParameter memory_data_param = 22;
   optional MVNParameter mvn_param = 34;
@@ -410,6 +412,17 @@ message ConvolutionParameter {
   optional Engine engine = 15 [default = DEFAULT];
 }
 
+// Message that stores parameters used by LocalWeightedConvolutionLayer
+message LocalWeightedConvolutionParameter {
+  optional uint32 num_output = 1; // The number of outputs for the layer (the layer CHECKs > 0)
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+  optional uint32 pad = 3 [default = 0]; // The padding size (used for both height and width)
+  optional uint32 kernel_size = 4; // The kernel size (square kernel; no default)
+  optional uint32 stride = 6 [default = 1]; // The stride (used for both height and width)
+  optional FillerParameter weight_filler = 7; // The filler for the weight
+  optional FillerParameter bias_filler = 8; // The filler for the bias
+}
+
 // Message that stores parameters used by DataLayer
 message DataParameter {
   enum DB {
diff --git a/src/caffe/test/test_local_weighted_conv_layer.cpp b/src/caffe/test/test_local_weighted_conv_layer.cpp
new file mode 100644
index 00000000000..a0bd09676d2
--- /dev/null
+++ b/src/caffe/test/test_local_weighted_conv_layer.cpp
@@ -0,0 +1,174 @@
+// Copyright 2014 BVLC and contributors.
+
+#include <cstring>
+#include <vector>
+
+#include "cuda_runtime.h"
+#include "gtest/gtest.h"
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/vision_layers.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+
+namespace caffe {
+
+extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
+
+template <typename Dtype>
+class LocalWeightedConvolutionLayerTest : public ::testing::Test {
+ protected:
+	LocalWeightedConvolutionLayerTest()
+      : blob_bottom_(new Blob<Dtype>()),
+        blob_top_(new Blob<Dtype>()) {}
+  virtual void SetUp() {
+    blob_bottom_->Reshape(2, 3, 6, 4);
+    // fill the values
+    FillerParameter filler_param;
+    filler_param.set_value(1.);
+    GaussianFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_);
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_top_vec_.push_back(blob_top_);
+  }
+
+  virtual ~LocalWeightedConvolutionLayerTest() { delete blob_bottom_; delete blob_top_; }
+  Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+typedef ::testing::Types<float, double> Dtypes;
+TYPED_TEST_CASE(LocalWeightedConvolutionLayerTest, Dtypes);
+
+TYPED_TEST(LocalWeightedConvolutionLayerTest, TestSetup) {
+  LayerParameter layer_param;
+  LocalWeightedConvolutionParameter* convolution_param =
+      layer_param.mutable_local_weighted_convolution_param();
+  convolution_param->set_kernel_size(3);
+  convolution_param->set_stride(2);
+  convolution_param->set_num_output(4);
+  shared_ptr<Layer<TypeParam> > layer(
+      new LocalWeightedConvolutionLayer<TypeParam>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(this->blob_top_->num(), 2);
+  EXPECT_EQ(this->blob_top_->channels(), 4);
+  EXPECT_EQ(this->blob_top_->height(), 2);
+  EXPECT_EQ(this->blob_top_->width(), 1);
+  convolution_param->set_num_output(3);
+  layer.reset(new LocalWeightedConvolutionLayer<TypeParam>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(this->blob_top_->num(), 2);
+  EXPECT_EQ(this->blob_top_->channels(), 3);
+  EXPECT_EQ(this->blob_top_->height(), 2);
+  EXPECT_EQ(this->blob_top_->width(), 1);
+}
+
+
+TYPED_TEST(LocalWeightedConvolutionLayerTest, TestCPUSimpleConvolution) {
+  // All-ones input with weights equal to the output-location index.
+  FillerParameter filler_param;
+  filler_param.set_value(1.);
+  ConstantFiller<TypeParam> filler(filler_param);
+  filler.Fill(this->blob_bottom_);
+  LayerParameter layer_param;
+  LocalWeightedConvolutionParameter* convolution_param =
+      layer_param.mutable_local_weighted_convolution_param();
+  convolution_param->set_kernel_size(3);
+  convolution_param->set_stride(1);
+  convolution_param->set_num_output(1);
+  convolution_param->mutable_weight_filler()->set_type("test_local_weight_convolution");
+  convolution_param->mutable_weight_filler()->set_value(1);
+  convolution_param->mutable_bias_filler()->set_type("constant");
+  convolution_param->mutable_bias_filler()->set_value(0.1);
+  shared_ptr<Layer<TypeParam> > layer(
+      new LocalWeightedConvolutionLayer<TypeParam>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  Caffe::set_mode(Caffe::CPU);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Each output should equal idx * 27 + 0.1, where idx is the output location.
+  const TypeParam* top_data = this->blob_top_->cpu_data();
+  for (int n=0; n<this->blob_top_->num(); n++) {
+    for (int k=0; k<this->blob_top_->channels(); k++) {
+      for (int j=0; j<this->blob_top_->height(); j++) {
+        for (int i=0; i<this->blob_top_->width(); i++) {
+          int idx = j*this->blob_top_->width()+i;
+          EXPECT_NEAR(*(top_data+this->blob_top_->offset(n, k, j, i)), idx*27+0.1, 1e-4);
+        }
+      }
+    }
+  }
+}
+
+
+TYPED_TEST(LocalWeightedConvolutionLayerTest, TestGPUSimpleConvolution) {
+  // All-ones input with weights equal to the output-location index.
+  FillerParameter filler_param;
+  filler_param.set_value(1.);
+  ConstantFiller<TypeParam> filler(filler_param);
+  filler.Fill(this->blob_bottom_);
+  LayerParameter layer_param;
+  LocalWeightedConvolutionParameter* convolution_param =
+      layer_param.mutable_local_weighted_convolution_param();
+  convolution_param->set_kernel_size(3);
+  convolution_param->set_stride(2);
+  convolution_param->set_num_output(4);
+  convolution_param->mutable_weight_filler()->set_type("test_local_weight_convolution");
+  convolution_param->mutable_weight_filler()->set_value(1);
+  convolution_param->mutable_bias_filler()->set_type("constant");
+  convolution_param->mutable_bias_filler()->set_value(0.1);
+  shared_ptr<Layer<TypeParam> > layer(
+      new LocalWeightedConvolutionLayer<TypeParam>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  Caffe::set_mode(Caffe::GPU);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Each output should equal idx * 27 + 0.1, where idx is the output location.
+  const TypeParam* top_data = this->blob_top_->cpu_data();
+  for (int n=0; n<this->blob_top_->num(); n++) {
+    for (int k=0; k<this->blob_top_->channels(); k++) {
+      for (int j=0; j<this->blob_top_->height(); j++) {
+        for (int i=0; i<this->blob_top_->width(); i++) {
+          int idx = j*this->blob_top_->width()+i;
+          EXPECT_NEAR(*(top_data+this->blob_top_->offset(n, k, j, i)), idx*27+0.1, 1e-4);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(LocalWeightedConvolutionLayerTest, TestCPUGradient) {
+  LayerParameter layer_param;
+  LocalWeightedConvolutionParameter* convolution_param =
+      layer_param.mutable_local_weighted_convolution_param();
+  convolution_param->set_kernel_size(3);
+  convolution_param->set_stride(2);
+  convolution_param->set_num_output(2);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  Caffe::set_mode(Caffe::CPU);
+  LocalWeightedConvolutionLayer<TypeParam> layer(layer_param);
+  GradientChecker<TypeParam> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(LocalWeightedConvolutionLayerTest, TestGPUGradient) {
+  LayerParameter layer_param;
+  LocalWeightedConvolutionParameter* convolution_param =
+      layer_param.mutable_local_weighted_convolution_param();
+  convolution_param->set_kernel_size(3);
+  convolution_param->set_stride(2);
+  convolution_param->set_num_output(2);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  Caffe::set_mode(Caffe::GPU);
+  LocalWeightedConvolutionLayer<TypeParam> layer(layer_param);
+  GradientChecker<TypeParam> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+}  // namespace caffe
diff --git a/src/caffe/util/local_update.cpp b/src/caffe/util/local_update.cpp
new file mode 100644
index 00000000000..69e605e1110
--- /dev/null
+++ b/src/caffe/util/local_update.cpp
@@ -0,0 +1,56 @@
+// Copyright 2014 BVLC and contributors.
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+
+#include "caffe/util/local_update.hpp"
+
+namespace caffe {
+
+// Accumulates R[q][n][p] += A[q][p] * B[n][p]; R is not cleared first.
+// A: output_num x location_num, B: filter_num x location_num, R: q x n x p.
+template <typename Dtype>
+void local_update1_cpu(const Dtype* data_A, const Dtype* data_B,
+                       Dtype* data_R, const int filter_num,
+                       const int location_num, const int output_num) {
+  const int total = filter_num * location_num * output_num;
+  for (int index = 0; index < total; ++index) {
+    const int p = index % location_num;                 // location
+    const int n = (index / location_num) % filter_num;  // filter row of B
+    const int q = (index / location_num) / filter_num;  // output row of A
+    data_R[index] += data_A[q * location_num + p] * data_B[n * location_num + p];
+  }
+}
+
+// Explicit instantiation
+template void local_update1_cpu<float>(const float* data_A, const float* data_B,
+                                   float* data_R, const int filter_num,
+                                   const int location_num, const int output_num);
+template void local_update1_cpu<double>(const double* data_A, const double* data_B,
+                                   double* data_R, const int filter_num,
+                                   const int location_num, const int output_num);
+
+template <typename Dtype>
+void local_update2_cpu(const Dtype* data_A, const Dtype* data_B,
+                       Dtype* data_R, const int filter_num,
+                       const int location_num, const int output_num) {
+  const int total = filter_num * location_num;  // R[n][p] += sum_q A[q][p] * B[q][n][p]
+  for (int index = 0; index < total; ++index) {
+    const int p = index % location_num;  // location
+    const int n = index / location_num;  // filter row
+    Dtype& acc = data_R[index];          // terms are added one at a time, in q order
+    for (int q = 0; q < output_num; ++q)
+      acc += data_A[q * location_num + p] * data_B[(q * filter_num + n) * location_num + p];
+  }
+}
+
+// Explicit instantiation
+template void local_update2_cpu<float>(const float* data_A, const float* data_B,
+                                   float* data_R, const int filter_num,
+                                   const int location_num, const int output_num);
+template void local_update2_cpu<double>(const double* data_A, const double* data_B,
+                                   double* data_R, const int filter_num,
+                                   const int location_num, const int output_num);
+
+}  // namespace caffe
diff --git a/src/caffe/util/local_update.cu b/src/caffe/util/local_update.cu
new file mode 100644
index 00000000000..f74d9bbea37
--- /dev/null
+++ b/src/caffe/util/local_update.cu
@@ -0,0 +1,85 @@
+// Copyright 2014 BVLC and contributors.
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+
+#include "caffe/common.hpp"
+#include "caffe/util/local_update.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void local_update1_gpu_kernel(const Dtype* data_A, const Dtype* data_B,
+                                    Dtype* data_R, const int filter_num,
+                                    const int location_num, const int output_num) {
+  const int total = filter_num * location_num * output_num;
+  CUDA_KERNEL_LOOP(index, total) {
+    const int p = index % location_num;      // location
+    const int row = index / location_num;    // flattened (q, n) pair
+    const int n = row % filter_num, q = row / filter_num;
+    data_R[index] += data_A[q * location_num + p] * data_B[n * location_num + p];  // R_qnp += A_qp * B_np
+  }
+}
+
+template <typename Dtype>
+void local_update1_gpu(const Dtype* data_A, const Dtype* data_B,
+                       Dtype* data_R, const int filter_num,
+                       const int location_num, const int output_num) {
+  // Shapes: A is output_num x location_num, B is filter_num x location_num,
+  // R is output_num x filter_num x location_num.
+  // Update: R[q][n][p] += A[q][p] * B[n][p].
+  const int count = filter_num * location_num * output_num;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  local_update1_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+      data_A, data_B, data_R, filter_num, location_num, output_num);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+// Explicit instantiation
+template void local_update1_gpu<float>(const float* data_A, const float* data_B,
+                                float* data_R, const int filter_num,
+                                const int location_num, const int output_num);
+template void local_update1_gpu<double>(const double* data_A, const double* data_B,
+                                double* data_R, const int filter_num,
+                                const int location_num, const int output_num);
+
+
+template <typename Dtype>
+__global__ void local_update2_gpu_kernel(const Dtype* data_A, const Dtype* data_B,
+                                Dtype* data_R, const int filter_num,
+                                const int location_num, const int output_num) {
+  const int total = filter_num * location_num;
+  CUDA_KERNEL_LOOP(index, total) {
+    const int p = index % location_num;  // location
+    const int n = index / location_num;  // filter row
+    // R[n][p] += sum over q of A[q][p] * B[q][n][p], added term by term.
+    for (int q = 0; q < output_num; ++q)
+      data_R[index] += data_A[q * location_num + p] * data_B[(q * filter_num + n) * location_num + p];
+  }
+}
+
+template <typename Dtype>
+void local_update2_gpu(const Dtype* data_A, const Dtype* data_B,
+                       Dtype* data_R, const int filter_num,
+                       const int location_num, const int output_num) {
+  // Shapes: A is output_num x location_num, B is output_num x filter_num x
+  // location_num, R is filter_num x location_num.
+  // Update: R[n][p] += sum over q of A[q][p] * B[q][n][p].
+  const int count = filter_num * location_num;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  local_update2_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+      data_A, data_B, data_R, filter_num, location_num, output_num);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+// Explicit instantiation
+template void local_update2_gpu<float>(const float* data_A, const float* data_B,
+                       float* data_R, const int filter_num,
+                       const int location_num, const int output_num);
+template void local_update2_gpu<double>(const double* data_A, const double* data_B,
+                       double* data_R, const int filter_num,
+                       const int location_num, const int output_num);
+
+}  // namespace caffe

From d732faf59962a3dfd5a49a969ccd6c45ba202960 Mon Sep 17 00:00:00 2001
From: Jack Culpepper <jackcul@yahoo-inc.com>
Date: Tue, 14 Oct 2014 00:08:39 +0000
Subject: [PATCH 02/10] remove vestigial bits of conv layer

---
 src/caffe/layers/local_weighted_conv_layer.cu | 94 -------------------
 1 file changed, 94 deletions(-)

diff --git a/src/caffe/layers/local_weighted_conv_layer.cu b/src/caffe/layers/local_weighted_conv_layer.cu
index 0e8bcf032bc..660b04fca4c 100644
--- a/src/caffe/layers/local_weighted_conv_layer.cu
+++ b/src/caffe/layers/local_weighted_conv_layer.cu
@@ -55,7 +55,6 @@ template <typename Dtype>
 void LocalWeightedConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 
-
   const Dtype* top_diff = top[0]->gpu_diff();
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
@@ -99,102 +98,9 @@ void LocalWeightedConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>
       col2im_gpu(x_diff, channels_, height_, width_, kernel_size_, kernel_size_,
                  pad_, pad_, stride_, stride_, bottom_diff + bottom[0]->offset(n));
     }
-
-
-
   }
-
-
-
-
-
-
-
-/*
-  const Dtype* weight = NULL;
-  Dtype* weight_diff = NULL;
-  if (this->param_propagate_down_[0]) {
-    weight = this->blobs_[0]->gpu_data();
-    weight_diff = this->blobs_[0]->mutable_gpu_diff();
-    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
-  }
-  Dtype* bias_diff = NULL;
-  if (bias_term_ && this->param_propagate_down_[1]) {
-    bias_diff = this->blobs_[1]->mutable_gpu_diff();
-    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
-  }
-  const int weight_offset = M_ * K_;
-  const int col_offset = K_ * N_;
-  const int top_offset = M_ * N_;
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = NULL;
-    // Bias gradient, if necessary.
-    if (bias_term_ && this->param_propagate_down_[1]) {
-      top_diff = top[i]->gpu_diff();
-      for (int n = 0; n < num_; ++n) {
-        caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, N_,
-            1., top_diff + top[0]->offset(n),
-            bias_multiplier_.gpu_data(), 1.,
-            bias_diff);
-      }
-    }
-    if (this->param_propagate_down_[0] || propagate_down[i]) {
-      if (!top_diff) {
-        top_diff = top[i]->gpu_diff();
-      }
-      Dtype* col_buff = NULL;
-      if (!is_1x1_) {
-        col_buff = col_buffer_.mutable_gpu_data();
-      }
-      const Dtype* bottom_data = bottom[i]->gpu_data();
-      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-      for (int n = 0; n < num_; ++n) {
-        // Since we saved memory in the forward pass by not storing all col
-        // data, we will need to recompute them.
-        if (!is_1x1_) {
-          im2col_gpu(bottom_data + bottom[i]->offset(n), channels_, height_,
-                    width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
-                    stride_h_, stride_w_, col_buff);
-        } else {
-          col_buff = bottom[i]->mutable_gpu_data() + bottom[i]->offset(n);
-        }
-        // gradient w.r.t. weight. Note that we will accumulate diffs.
-        if (this->param_propagate_down_[0]) {
-          for (int g = 0; g < group_; ++g) {
-            caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
-                (Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
-                col_buff + col_offset * g, (Dtype)1.,
-                weight_diff + weight_offset * g);
-          }
-        }
-        // gradient w.r.t. bottom data, if necessary
-        if (propagate_down[i]) {
-          if (weight == NULL) {
-            weight = this->blobs_[0]->gpu_data();
-          }
-          if (is_1x1_) {
-            col_buff = bottom[i]->mutable_gpu_diff() + bottom[i]->offset(n);
-          }
-          for (int g = 0; g < group_; ++g) {
-            caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
-                (Dtype)1., weight + weight_offset * g,
-                top_diff + top[i]->offset(n) + top_offset * g,
-                (Dtype)0., col_buff + col_offset * g);
-          }
-          // col2im back to the data
-          if (!is_1x1_) {
-            col2im_gpu(col_buff, channels_, height_, width_,
-                kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
-                bottom_diff + bottom[i]->offset(n));
-          }
-        }
-      }
-    }
-  }
-*/
 }
 
-
 INSTANTIATE_LAYER_GPU_FUNCS(LocalWeightedConvolutionLayer);
 
 }  // namespace caffe

From 1e13b50965c96fd89ddaa610c9495544464ebaaa Mon Sep 17 00:00:00 2001
From: Jack Culpepper <jackcul@yahoo-inc.com>
Date: Fri, 17 Oct 2014 18:31:59 +0000
Subject: [PATCH 03/10] example .prototxt files with local layers

---
 examples/mnist/lenet_local_solver.prototxt    |  25 +++
 .../mnist/lenet_local_train_test.prototxt     | 171 ++++++++++++++++++
 examples/mnist/train_lenet.sh                 |   2 +-
 examples/mnist/train_lenet_local.sh           |   3 +
 4 files changed, 200 insertions(+), 1 deletion(-)
 create mode 100644 examples/mnist/lenet_local_solver.prototxt
 create mode 100644 examples/mnist/lenet_local_train_test.prototxt
 create mode 100755 examples/mnist/train_lenet_local.sh

diff --git a/examples/mnist/lenet_local_solver.prototxt b/examples/mnist/lenet_local_solver.prototxt
new file mode 100644
index 00000000000..a4b02d25e7e
--- /dev/null
+++ b/examples/mnist/lenet_local_solver.prototxt
@@ -0,0 +1,25 @@
+# The train/test net protocol buffer definition
+net: "examples/mnist/lenet_local_train_test.prototxt"
+# test_iter specifies how many forward passes the test should carry out.
+# In the case of MNIST, we have test batch size 100 and 100 test iterations,
+# covering the full 10,000 testing images.
+test_iter: 100
+# Carry out testing every 500 training iterations.
+test_interval: 500
+# The base learning rate, momentum and the weight decay of the network.
+base_lr: 0.01
+momentum: 0.9
+weight_decay: 0.0005
+# The learning rate policy
+lr_policy: "inv"
+gamma: 0.0001
+power: 0.75
+# Display every 100 iterations
+display: 100
+# The maximum number of iterations
+max_iter: 10000
+# snapshot intermediate results (own prefix, so plain LeNet snapshots are not overwritten)
+snapshot: 5000
+snapshot_prefix: "examples/mnist/lenet_local"
+# solver mode: CPU or GPU
+solver_mode: GPU
diff --git a/examples/mnist/lenet_local_train_test.prototxt b/examples/mnist/lenet_local_train_test.prototxt
new file mode 100644
index 00000000000..149cd8a53a9
--- /dev/null
+++ b/examples/mnist/lenet_local_train_test.prototxt
@@ -0,0 +1,171 @@
+name: "LeNet"
+layers {
+  name: "mnist"
+  type: DATA
+  top: "data"
+  top: "label"
+  data_param {
+    source: "examples/mnist/mnist_train_lmdb"
+    backend: LMDB
+    batch_size: 64
+  }
+  transform_param {
+    scale: 0.00390625
+  }
+  include: { phase: TRAIN }
+}
+layers {
+  name: "mnist"
+  type: DATA
+  top: "data"
+  top: "label"
+  data_param {
+    source: "examples/mnist/mnist_test_lmdb"
+    backend: LMDB
+    batch_size: 100
+  }
+  transform_param {
+    scale: 0.00390625
+  }
+  include: { phase: TEST }
+}
+
+layers {
+  name: "conv1"
+  type: CONVOLUTION
+  bottom: "data"
+  top: "conv1"
+  blobs_lr: 1
+  blobs_lr: 2
+  convolution_param {
+    num_output: 20
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layers {
+  name: "pool1"
+  type: POOLING
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layers {
+  name: "local1"
+  type: LOCAL_WEIGHTED_CONVOLUTION
+  bottom: "pool1"
+  top: "local1"
+  blobs_lr: 1
+  blobs_lr: 1
+  local_weighted_convolution_param {
+    num_output: 5
+    kernel_size: 5
+    stride: 1
+    pad: 0
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layers {
+  name: "relu1"
+  type: RELU
+  bottom: "local1"
+  top: "local1"
+}
+layers {
+  name: "local2"
+  type: LOCAL_WEIGHTED_CONVOLUTION
+  bottom: "local1"
+  top: "local2"
+  blobs_lr: 1
+  blobs_lr: 1
+  local_weighted_convolution_param {
+    num_output: 10
+    kernel_size: 5
+    stride: 1
+    pad: 0
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layers {
+  name: "relu2"
+  type: RELU
+  bottom: "local2"
+  top: "local2"
+}
+layers {
+  name: "ip1"
+  type: INNER_PRODUCT
+  bottom: "local2"
+  top: "ip1"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 500
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layers {
+  name: "relu3"
+  type: RELU
+  bottom: "ip1"
+  top: "ip1"
+}
+layers {
+  name: "ip2"
+  type: INNER_PRODUCT
+  bottom: "ip1"
+  top: "ip2"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 10
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layers {
+  name: "accuracy"
+  type: ACCURACY
+  bottom: "ip2"
+  bottom: "label"
+  top: "accuracy"
+  include: { phase: TEST }
+}
+layers {
+  name: "loss"
+  type: SOFTMAX_LOSS
+  bottom: "ip2"
+  bottom: "label"
+  top: "loss"
+}
diff --git a/examples/mnist/train_lenet.sh b/examples/mnist/train_lenet.sh
index 1b6bf7d978d..1f718825ded 100755
--- a/examples/mnist/train_lenet.sh
+++ b/examples/mnist/train_lenet.sh
@@ -1,3 +1,3 @@
 #!/usr/bin/env sh
 
-./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt
+GLOG_logtostderr=0 GLOG_log_dir=examples/mnist/ ./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt
diff --git a/examples/mnist/train_lenet_local.sh b/examples/mnist/train_lenet_local.sh
new file mode 100755
index 00000000000..b9e29e5ceb8
--- /dev/null
+++ b/examples/mnist/train_lenet_local.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env sh
+# No hard-coded --gpu index: GPU 1 does not exist on single-GPU machines.
+GLOG_logtostderr=0 GLOG_log_dir=examples/mnist/ ./build/tools/caffe train --solver=examples/mnist/lenet_local_solver.prototxt

From 23a0d016e1fe608989a14c88788b48e353367b82 Mon Sep 17 00:00:00 2001
From: Jack Culpepper <jackcul@yahoo-inc.com>
Date: Fri, 17 Oct 2014 20:13:44 +0000
Subject: [PATCH 04/10] prototxt files showing a siamese network with two local
 layers i also added another contrastive loss after the first local layer,
 because it seems to plateau otherwise

---
 .../mnist_siamese_local_solver.prototxt       |  25 +
 .../mnist_siamese_local_train_test.prototxt   | 435 ++++++++++++++++++
 examples/siamese/train_mnist_siamese_local.sh |   5 +
 3 files changed, 465 insertions(+)
 create mode 100644 examples/siamese/mnist_siamese_local_solver.prototxt
 create mode 100644 examples/siamese/mnist_siamese_local_train_test.prototxt
 create mode 100755 examples/siamese/train_mnist_siamese_local.sh

diff --git a/examples/siamese/mnist_siamese_local_solver.prototxt b/examples/siamese/mnist_siamese_local_solver.prototxt
new file mode 100644
index 00000000000..c85bb90fc87
--- /dev/null
+++ b/examples/siamese/mnist_siamese_local_solver.prototxt
@@ -0,0 +1,25 @@
+# The train/test net protocol buffer definition
+net: "examples/siamese/mnist_siamese_local_train_test.prototxt"
+# test_iter specifies how many forward passes the test should carry out.
+# In the case of MNIST, we have test batch size 100 and 100 test iterations,
+# covering the full 10,000 testing images.
+test_iter: 100
+# Carry out testing every 500 training iterations.
+test_interval: 500
+# The base learning rate, momentum and the weight decay of the network.
+base_lr: 0.01
+momentum: 0.9
+weight_decay: 0.0000
+# The learning rate policy
+lr_policy: "inv"
+gamma: 0.0001
+power: 0.75
+# Display every 100 iterations
+display: 100
+# The maximum number of iterations
+max_iter: 50000
+# snapshot intermediate results (own prefix, so the non-local siamese snapshots are not overwritten)
+snapshot: 5000
+snapshot_prefix: "examples/siamese/mnist_siamese_local"
+# solver mode: CPU or GPU
+solver_mode: GPU
diff --git a/examples/siamese/mnist_siamese_local_train_test.prototxt b/examples/siamese/mnist_siamese_local_train_test.prototxt
new file mode 100644
index 00000000000..37df99abcbe
--- /dev/null
+++ b/examples/siamese/mnist_siamese_local_train_test.prototxt
@@ -0,0 +1,435 @@
+name: "mnist_siamese_train_test"
+layers {
+  name: "pair_data"
+  type: DATA
+  top: "pair_data"
+  top: "sim"
+  data_param {
+    source: "examples/siamese/mnist_siamese_train_leveldb"
+    scale: 0.00390625
+    batch_size: 64
+  }
+  include: { phase: TRAIN }
+}
+layers {
+  name: "pair_data"
+  type: DATA
+  top: "pair_data"
+  top: "sim"
+  data_param {
+    source: "examples/siamese/mnist_siamese_test_leveldb"
+    scale: 0.00390625
+    batch_size: 100
+  }
+  include: { phase: TEST }
+}
+layers {
+    name: "slice_pair"
+    type: SLICE
+    bottom: "pair_data"
+    top: "data"
+    top: "data_p"
+    slice_param {
+        slice_dim: 1
+        slice_point: 1
+    }
+}
+
+
+
+
+layers {
+  name: "conv1"
+  type: CONVOLUTION
+  bottom: "data"
+  top: "conv1"
+  blobs_lr: 1
+  blobs_lr: 2
+  convolution_param {
+    num_output: 20
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "conv1_w"
+  param: "conv1_b"
+}
+layers {
+  name: "pool1"
+  type: POOLING
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+
+
+layers {
+  name: "local1"
+  type: LOCAL_WEIGHTED_CONVOLUTION
+  bottom: "pool1"
+  top: "local1"
+  blobs_lr: 1
+  blobs_lr: 2
+  local_weighted_convolution_param {
+    num_output: 5
+    kernel_size: 5
+    stride: 1
+    pad: 0
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+  param: "local1_w"
+  param: "local1_b"
+}
+layers {
+  name: "relu1"
+  type: RELU
+  bottom: "local1"
+  top: "local1"
+}
+layers {
+  name: "local2"
+  type: LOCAL_WEIGHTED_CONVOLUTION
+  bottom: "local1"
+  top: "local2"
+  blobs_lr: 1
+  blobs_lr: 2
+  local_weighted_convolution_param {
+    num_output: 10
+    kernel_size: 5
+    stride: 1
+    pad: 0
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+  param: "local2_w"
+  param: "local2_b"
+}
+layers {
+  name: "relu2"
+  type: RELU
+  bottom: "local2"
+  top: "local2"
+}
+
+
+
+
+
+
+
+layers {
+  name: "ip1"
+  type: INNER_PRODUCT
+  bottom: "local2"
+  top: "ip1"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 500
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "ip1_w"
+  param: "ip1_b"
+}
+layers {
+  name: "relu3"
+  type: RELU
+  bottom: "ip1"
+  top: "ip1"
+}
+layers {
+  name: "ip2"
+  type: INNER_PRODUCT
+  bottom: "ip1"
+  top: "ip2"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 10
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "ip2_w"
+  param: "ip2_b"
+}
+
+layers {
+  name: "feat2"
+  type: INNER_PRODUCT
+  bottom: "ip2"
+  top: "feat2"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 2
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "feat2_w"
+  param: "feat2_b"
+}
+
+layers {
+  name: "feat1"
+  type: INNER_PRODUCT
+  bottom: "local1"
+  top: "feat1"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 2
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "feat1_w"
+  param: "feat1_b"
+}
+
+
+
+layers {
+  name: "conv1_p"
+  type: CONVOLUTION
+  bottom: "data_p"
+  top: "conv1_p"
+  blobs_lr: 1
+  blobs_lr: 2
+  convolution_param {
+    num_output: 20
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "conv1_w"
+  param: "conv1_b"
+}
+layers {
+  name: "pool1_p"
+  type: POOLING
+  bottom: "conv1_p"
+  top: "pool1_p"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+
+
+layers {
+  name: "local1_p"
+  type: LOCAL_WEIGHTED_CONVOLUTION
+  bottom: "pool1_p"
+  top: "local1_p"
+  blobs_lr: 1
+  blobs_lr: 2
+  local_weighted_convolution_param {
+    num_output: 5
+    kernel_size: 5
+    stride: 1
+    pad: 0
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+  param: "local1_w"
+  param: "local1_b"
+}
+layers {
+  name: "relu1_p"
+  type: RELU
+  bottom: "local1_p"
+  top: "local1_p"
+}
+layers {
+  name: "local2_p"
+  type: LOCAL_WEIGHTED_CONVOLUTION
+  bottom: "local1_p"
+  top: "local2_p"
+  blobs_lr: 1
+  blobs_lr: 2
+  local_weighted_convolution_param {
+    num_output: 10
+    kernel_size: 5
+    stride: 1
+    pad: 0
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+  param: "local2_w"
+  param: "local2_b"
+}
+layers {
+  name: "relu2_p"
+  type: RELU
+  bottom: "local2_p"
+  top: "local2_p"
+}
+
+
+
+
+
+layers {
+  name: "ip1_p"
+  type: INNER_PRODUCT
+  bottom: "local2_p"
+  top: "ip1_p"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 500
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "ip1_w"
+  param: "ip1_b"
+}
+layers {
+  name: "relu3_p"
+  type: RELU
+  bottom: "ip1_p"
+  top: "ip1_p"
+}
+layers {
+  name: "ip2_p"
+  type: INNER_PRODUCT
+  bottom: "ip1_p"
+  top: "ip2_p"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 10
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "ip2_w"
+  param: "ip2_b"
+}
+
+layers {
+  name: "feat2_p"
+  type: INNER_PRODUCT
+  bottom: "ip2_p"
+  top: "feat2_p"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 2
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "feat2_w"
+  param: "feat2_b"
+}
+
+
+layers {
+  name: "feat1_p"
+  type: INNER_PRODUCT
+  bottom: "local1_p"
+  top: "feat1_p"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 2
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param: "feat1_w"
+  param: "feat1_b"
+}
+
+
+layers {
+    name: "loss2"
+    type: CONTRASTIVE_LOSS
+    contrastive_loss_param {
+        margin: 1.0
+    }
+    bottom: "feat2"
+    bottom: "feat2_p"
+    bottom: "sim"
+    top: "loss2"
+}
+
+
+layers {
+    name: "loss1"
+    type: CONTRASTIVE_LOSS
+    contrastive_loss_param {
+        margin: 1.0
+    }
+    bottom: "feat1"
+    bottom: "feat1_p"
+    bottom: "sim"
+    top: "loss1"
+}
+
diff --git a/examples/siamese/train_mnist_siamese_local.sh b/examples/siamese/train_mnist_siamese_local.sh
new file mode 100755
index 00000000000..83035bf2ccc
--- /dev/null
+++ b/examples/siamese/train_mnist_siamese_local.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env sh
+# Trains with mnist_siamese_local_solver.prototxt; glog is pointed at examples/siamese/ instead of stderr.
+TOOLS=./build/tools
+
+GLOG_logtostderr=0 GLOG_log_dir=examples/siamese/ $TOOLS/caffe train --solver=examples/siamese/mnist_siamese_local_solver.prototxt

From 5082b4ad4ad5e3738c3188918e8e438918e86586 Mon Sep 17 00:00:00 2001
From: Jack Culpepper <jackcul@yahoo-inc.com>
Date: Fri, 17 Oct 2014 21:00:51 +0000
Subject: [PATCH 05/10] fix Reshape() to take on the appropriate tasks

---
 .../layers/local_weighted_conv_layer.cpp      | 50 ++++++-------------
 1 file changed, 16 insertions(+), 34 deletions(-)

diff --git a/src/caffe/layers/local_weighted_conv_layer.cpp b/src/caffe/layers/local_weighted_conv_layer.cpp
index 24712da1570..9b41be14c42 100644
--- a/src/caffe/layers/local_weighted_conv_layer.cpp
+++ b/src/caffe/layers/local_weighted_conv_layer.cpp
@@ -22,22 +22,20 @@ void LocalWeightedConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>
   height_ = bottom[0]->height();
   width_ = bottom[0]->width();
   num_output_ = this->layer_param_.local_weighted_convolution_param().num_output();
-  CHECK_GT(num_output_, 0); 
-  CHECK_GE(height_, kernel_size_) << "height smaller than kernel size";
-  CHECK_GE(width_, kernel_size_) << "width smaller than kernel size";
-  // The im2col result buffer would only hold one image at a time to avoid
-  // overly large memory usage.
+
   height_out_ = (height_ + 2 * pad_ - kernel_size_) / stride_ + 1;
   width_out_ = (width_ + 2 * pad_ - kernel_size_) / stride_ + 1;
-  col_buffer_.Reshape(
-      1, channels_ * kernel_size_ * kernel_size_, height_out_, width_out_);
-  // Set the parameters
-  bias_term_ = this->layer_param_.local_weighted_convolution_param().bias_term();
-  // Figure out the dimensions for individual gemms.
+
   M_ = num_output_;
   K_ = channels_ * kernel_size_ * kernel_size_;
   N_ = height_out_ * width_out_;
-  top[0]->Reshape(bottom[0]->num(), num_output_, height_out_, width_out_);
+
+  CHECK_GT(num_output_, 0); 
+  CHECK_GE(height_, kernel_size_) << "height smaller than kernel size";
+  CHECK_GE(width_, kernel_size_) << "width smaller than kernel size";
+  // Set the parameters
+  bias_term_ = this->layer_param_.local_weighted_convolution_param().bias_term();
+
   // Check if we need to set up the weights
   if (this->blobs_.size() > 0) {
     LOG(INFO) << "Skipping parameter initialization";
@@ -67,12 +65,8 @@ void LocalWeightedConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>
 template <typename Dtype>
 void LocalWeightedConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-/*
-  num_ = bottom[0]->num();
-  height_ = bottom[0]->height();
-  width_ = bottom[0]->width();
   CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with"
-    " convolution kernel.";
+    " weights.";
   // TODO: generalize to handle inputs of different shapes.
   for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) {
     CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num.";
@@ -83,32 +77,20 @@ void LocalWeightedConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& b
     CHECK_EQ(width_, bottom[bottom_id]->width())
         << "Inputs must have same width.";
   }
+
   // Shape the tops.
-  height_out_ =
-      (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
-  width_out_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
   for (int top_id = 0; top_id < top.size(); ++top_id) {
     top[top_id]->Reshape(num_, num_output_, height_out_, width_out_);
   }
-  // Prepare the matrix multiplication computation.
-  // Each input will be convolved as a single GEMM.
-  M_ = num_output_ / group_;
-  K_ = channels_ * kernel_h_ * kernel_w_ / group_;
-  N_ = height_out_ * width_out_;
-  // The im2col result buffer will only hold one image at a time to avoid
-  // overly large memory usage. In the special case of 1x1 convolution
-  // it goes lazily unused to save memory.
+
+  // The im2col result buffer would only hold one image at a time to avoid
+  // overly large memory usage.
   col_buffer_.Reshape(
-      1, channels_ * kernel_h_ * kernel_w_, height_out_, width_out_);
+      1, channels_ * kernel_size_ * kernel_size_, height_out_, width_out_);
+
   for (int top_id = 0; top_id < top.size(); ++top_id) {
     top[top_id]->Reshape(num_, num_output_, height_out_, width_out_);
   }
-  // Set up the all ones "bias multiplier" for adding biases by BLAS
-  if (bias_term_) {
-    bias_multiplier_.Reshape(1, 1, 1, N_);
-    caffe_set(N_, Dtype(1), bias_multiplier_.mutable_cpu_data());
-  }
-*/
 }
 
 template <typename Dtype>

From 116f2f410d712d580322c57cd8249e8f98e77738 Mon Sep 17 00:00:00 2001
From: Jack Culpepper <jackcul@yahoo-inc.com>
Date: Fri, 17 Oct 2014 21:46:25 +0000
Subject: [PATCH 06/10] cleanup: rename "local weighted conv" to just "local"

---
 include/caffe/filler.hpp                      |  8 +--
 include/caffe/vision_layers.hpp               |  6 +--
 src/caffe/layer_factory.cpp                   |  2 +-
 ...eighted_conv_layer.cpp => local_layer.cpp} | 26 ++++-----
 ..._weighted_conv_layer.cu => local_layer.cu} |  6 +--
 src/caffe/proto/caffe.proto                   | 10 ++--
 ...ed_conv_layer.cpp => test_local_layer.cpp} | 54 +++++++++----------
 7 files changed, 56 insertions(+), 56 deletions(-)
 rename src/caffe/layers/{local_weighted_conv_layer.cpp => local_layer.cpp} (86%)
 rename src/caffe/layers/{local_weighted_conv_layer.cu => local_layer.cu} (93%)
 rename src/caffe/test/{test_local_weighted_conv_layer.cpp => test_local_layer.cpp} (76%)

diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
index 46fb60992f4..291c122fa7d 100644
--- a/include/caffe/filler.hpp
+++ b/include/caffe/filler.hpp
@@ -159,9 +159,9 @@ class XavierFiller : public Filler<Dtype> {
 
 
 template <typename Dtype>
-class TestLocalWeightConvolutionFiller : public Filler<Dtype> {
+class TestLocalFiller : public Filler<Dtype> {
  public:
-  explicit TestLocalWeightConvolutionFiller(const FillerParameter& param)
+  explicit TestLocalFiller(const FillerParameter& param)
       : Filler<Dtype>(param) {}
   virtual void Fill(Blob<Dtype>* blob) {
     LOG(INFO) << "Doing mutable cpu";
@@ -200,8 +200,8 @@ Filler<Dtype>* GetFiller(const FillerParameter& param) {
     return new UniformFiller<Dtype>(param);
   } else if (type == "xavier") {
     return new XavierFiller<Dtype>(param);
-  } else if (type == "test_local_weight_convolution") {
-    return new TestLocalWeightConvolutionFiller<Dtype>(param);
+  } else if (type == "test_local") {
+    return new TestLocalFiller<Dtype>(param);
   } else {
     CHECK(false) << "Unknown filler name: " << param.type();
   }
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index e8fbcaf694c..8465750af18 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -115,9 +115,9 @@ class ConvolutionLayer : public Layer<Dtype> {
 
 
 template <typename Dtype>
-class LocalWeightedConvolutionLayer : public Layer<Dtype> {
+class LocalLayer : public Layer<Dtype> {
  public:
-  explicit LocalWeightedConvolutionLayer(const LayerParameter& param)
+  explicit LocalLayer(const LayerParameter& param)
       : Layer<Dtype>(param) {}
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
@@ -125,7 +125,7 @@ class LocalWeightedConvolutionLayer : public Layer<Dtype> {
       const vector<Blob<Dtype>*>& top);
 
   virtual inline LayerParameter_LayerType type() const {
-    return LayerParameter_LayerType_LOCAL_WEIGHTED_CONVOLUTION;
+    return LayerParameter_LayerType_LOCAL;
   }
   virtual inline int MinBottomBlobs() const { return 1; }
   virtual inline int MinTopBlobs() const { return 1; }
diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
index a638008ce01..b17396b51b9 100644
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@@ -185,6 +185,6 @@ REGISTER_LAYER_CLASS(SOFTMAX_LOSS, SoftmaxWithLossLayer);
 REGISTER_LAYER_CLASS(SPLIT, SplitLayer);
 REGISTER_LAYER_CLASS(THRESHOLD, ThresholdLayer);
 REGISTER_LAYER_CLASS(WINDOW_DATA, WindowDataLayer);
-REGISTER_LAYER_CLASS(LOCAL_WEIGHTED_CONVOLUTION, LocalWeightedConvolutionLayer);
+REGISTER_LAYER_CLASS(LOCAL, LocalLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/local_weighted_conv_layer.cpp b/src/caffe/layers/local_layer.cpp
similarity index 86%
rename from src/caffe/layers/local_weighted_conv_layer.cpp
rename to src/caffe/layers/local_layer.cpp
index 9b41be14c42..a00b934174e 100644
--- a/src/caffe/layers/local_weighted_conv_layer.cpp
+++ b/src/caffe/layers/local_layer.cpp
@@ -9,19 +9,19 @@
 namespace caffe {
 
 template <typename Dtype>
-void LocalWeightedConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+void LocalLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   CHECK_EQ(bottom.size(), 1) << "Conv Layer takes a single blob as input.";
   CHECK_EQ(top.size(), 1) << "Conv Layer takes a single blob as output.";
 
-  kernel_size_ = this->layer_param_.local_weighted_convolution_param().kernel_size();
-  stride_ = this->layer_param_.local_weighted_convolution_param().stride();
-  pad_ = this->layer_param_.local_weighted_convolution_param().pad();
+  kernel_size_ = this->layer_param_.local_param().kernel_size();
+  stride_ = this->layer_param_.local_param().stride();
+  pad_ = this->layer_param_.local_param().pad();
   num_ = bottom[0]->num();
   channels_ = bottom[0]->channels();
   height_ = bottom[0]->height();
   width_ = bottom[0]->width();
-  num_output_ = this->layer_param_.local_weighted_convolution_param().num_output();
+  num_output_ = this->layer_param_.local_param().num_output();
 
   height_out_ = (height_ + 2 * pad_ - kernel_size_) / stride_ + 1;
   width_out_ = (width_ + 2 * pad_ - kernel_size_) / stride_ + 1;
@@ -34,7 +34,7 @@ void LocalWeightedConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>
   CHECK_GE(height_, kernel_size_) << "height smaller than kernel size";
   CHECK_GE(width_, kernel_size_) << "width smaller than kernel size";
   // Set the parameters
-  bias_term_ = this->layer_param_.local_weighted_convolution_param().bias_term();
+  bias_term_ = this->layer_param_.local_param().bias_term();
 
   // Check if we need to set up the weights
   if (this->blobs_.size() > 0) {
@@ -50,20 +50,20 @@ void LocalWeightedConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>
         num_output_, 1, K_, N_));
     // fill the weights
     shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
-        this->layer_param_.local_weighted_convolution_param().weight_filler()));
+        this->layer_param_.local_param().weight_filler()));
     weight_filler->Fill(this->blobs_[0].get());
     // If necessary, intiialize and fill the bias term
     if (bias_term_) {
       this->blobs_[1].reset(new Blob<Dtype>(1, 1, M_, N_));
       shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
-          this->layer_param_.local_weighted_convolution_param().bias_filler()));
+          this->layer_param_.local_param().bias_filler()));
       bias_filler->Fill(this->blobs_[1].get());  
     }
   }
 }
 
 template <typename Dtype>
-void LocalWeightedConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+void LocalLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with"
     " weights.";
@@ -94,7 +94,7 @@ void LocalWeightedConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& b
 }
 
 template <typename Dtype>
-void LocalWeightedConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+void LocalLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
 
   Dtype* x_data = col_buffer_.mutable_cpu_data();
@@ -134,7 +134,7 @@ void LocalWeightedConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*
 }
 
 template <typename Dtype>
-void LocalWeightedConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+void LocalLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 
   const Dtype* top_diff = top[0]->cpu_diff();
@@ -203,9 +203,9 @@ void LocalWeightedConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>
 }
 
 #ifdef CPU_ONLY
-STUB_GPU(LocalWeightedConvolutionLayer);
+STUB_GPU(LocalLayer);
 #endif
 
-INSTANTIATE_CLASS(LocalWeightedConvolutionLayer);
+INSTANTIATE_CLASS(LocalLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/local_weighted_conv_layer.cu b/src/caffe/layers/local_layer.cu
similarity index 93%
rename from src/caffe/layers/local_weighted_conv_layer.cu
rename to src/caffe/layers/local_layer.cu
index 660b04fca4c..d2536424480 100644
--- a/src/caffe/layers/local_weighted_conv_layer.cu
+++ b/src/caffe/layers/local_layer.cu
@@ -11,7 +11,7 @@ namespace caffe {
 
 /// @brief refer to CPU forward -- the BLAS implementation is the same.
 template <typename Dtype>
-void LocalWeightedConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+void LocalLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
 
   Dtype* x_data = col_buffer_.mutable_gpu_data();
@@ -52,7 +52,7 @@ void LocalWeightedConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*
 
 /// @brief refer to CPU backward -- the BLAS implementation is the same.
 template <typename Dtype>
-void LocalWeightedConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+void LocalLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 
   const Dtype* top_diff = top[0]->gpu_diff();
@@ -101,6 +101,6 @@ void LocalWeightedConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>
   }
 }
 
-INSTANTIATE_LAYER_GPU_FUNCS(LocalWeightedConvolutionLayer);
+INSTANTIATE_LAYER_GPU_FUNCS(LocalLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 23472365e33..8cc18a5fd20 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -221,7 +221,7 @@ message LayerParameter {
   // line above the enum. Update the next available ID when you add a new
   // LayerType.
   //
-  // LayerType next available ID: 40 (last added: LOCAL_WEIGHTED_CONVOLUTION)
+  // LayerType next available ID: 40 (last added: LOCAL)
   enum LayerType {
     // "NONE" layer type is 0th enum element so that we don't cause confusion
     // by defaulting to an existent LayerType (instead, should usually error if
@@ -248,7 +248,7 @@ message LayerParameter {
     IMAGE_DATA = 12;
     INFOGAIN_LOSS = 13;
     INNER_PRODUCT = 14;
-    LOCAL_WEIGHTED_CONVOLUTION = 39;
+    LOCAL = 39;
     LRN = 15;
     MEMORY_DATA = 29;
     MULTINOMIAL_LOGISTIC_LOSS = 16;
@@ -310,7 +310,7 @@ message LayerParameter {
   optional ImageDataParameter image_data_param = 15;
   optional InfogainLossParameter infogain_loss_param = 16;
   optional InnerProductParameter inner_product_param = 17;
-  optional LocalWeightedConvolutionParameter local_weighted_convolution_param = 102;
+  optional LocalParameter local_param = 102;
   optional LRNParameter lrn_param = 18;
   optional MemoryDataParameter memory_data_param = 22;
   optional MVNParameter mvn_param = 34;
@@ -412,8 +412,8 @@ message ConvolutionParameter {
   optional Engine engine = 15 [default = DEFAULT];
 }
 
-// Message that stores parameters used by LocalWeightedConvolutionLayer
-message LocalWeightedConvolutionParameter {
+// Message that stores parameters used by LocalLayer
+message LocalParameter {
   optional uint32 num_output = 1; // The number of outputs for the layer
   optional bool bias_term = 2 [default = true]; // whether to have bias terms
   optional uint32 pad = 3 [default = 0]; // The padding size
diff --git a/src/caffe/test/test_local_weighted_conv_layer.cpp b/src/caffe/test/test_local_layer.cpp
similarity index 76%
rename from src/caffe/test/test_local_weighted_conv_layer.cpp
rename to src/caffe/test/test_local_layer.cpp
index a0bd09676d2..d5b09d8488f 100644
--- a/src/caffe/test/test_local_weighted_conv_layer.cpp
+++ b/src/caffe/test/test_local_layer.cpp
@@ -18,9 +18,9 @@ namespace caffe {
 extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
 
 template <typename Dtype>
-class LocalWeightedConvolutionLayerTest : public ::testing::Test {
+class LocalLayerTest : public ::testing::Test {
  protected:
-	LocalWeightedConvolutionLayerTest()
+	LocalLayerTest()
       : blob_bottom_(new Blob<Dtype>()),
         blob_top_(new Blob<Dtype>()) {}
   virtual void SetUp() {
@@ -34,7 +34,7 @@ class LocalWeightedConvolutionLayerTest : public ::testing::Test {
     blob_top_vec_.push_back(blob_top_);
   }
 
-  virtual ~LocalWeightedConvolutionLayerTest() { delete blob_bottom_; delete blob_top_; }
+  virtual ~LocalLayerTest() { delete blob_bottom_; delete blob_top_; }
   Blob<Dtype>* const blob_bottom_;
   Blob<Dtype>* const blob_top_;
   vector<Blob<Dtype>*> blob_bottom_vec_;
@@ -42,24 +42,24 @@ class LocalWeightedConvolutionLayerTest : public ::testing::Test {
 };
 
 typedef ::testing::Types<float, double> Dtypes;
-TYPED_TEST_CASE(LocalWeightedConvolutionLayerTest, Dtypes);
+TYPED_TEST_CASE(LocalLayerTest, Dtypes);
 
-TYPED_TEST(LocalWeightedConvolutionLayerTest, TestSetup) {
+TYPED_TEST(LocalLayerTest, TestSetup) {
   LayerParameter layer_param;
-  LocalWeightedConvolutionParameter* convolution_param =
-      layer_param.mutable_local_weighted_convolution_param();
+  LocalParameter* convolution_param =
+      layer_param.mutable_local_param();
   convolution_param->set_kernel_size(3);
   convolution_param->set_stride(2);
   convolution_param->set_num_output(4);
   shared_ptr<Layer<TypeParam> > layer(
-      new LocalWeightedConvolutionLayer<TypeParam>(layer_param));
+      new LocalLayer<TypeParam>(layer_param));
   layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   EXPECT_EQ(this->blob_top_->num(), 2);
   EXPECT_EQ(this->blob_top_->channels(), 4);
   EXPECT_EQ(this->blob_top_->height(), 2);
   EXPECT_EQ(this->blob_top_->width(), 1);
   convolution_param->set_num_output(3);
-  layer.reset(new LocalWeightedConvolutionLayer<TypeParam>(layer_param));
+  layer.reset(new LocalLayer<TypeParam>(layer_param));
   layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   EXPECT_EQ(this->blob_top_->num(), 2);
   EXPECT_EQ(this->blob_top_->channels(), 3);
@@ -68,24 +68,24 @@ TYPED_TEST(LocalWeightedConvolutionLayerTest, TestSetup) {
 }
 
 
-TYPED_TEST(LocalWeightedConvolutionLayerTest, TestCPUSimpleConvolution) {
+TYPED_TEST(LocalLayerTest, TestCPUSimpleConvolution) {
   // We will simply see if the convolution layer carries out averaging well.
   FillerParameter filler_param;
   filler_param.set_value(1.);
   ConstantFiller<TypeParam> filler(filler_param);
   filler.Fill(this->blob_bottom_);
   LayerParameter layer_param;
-  LocalWeightedConvolutionParameter* convolution_param =
-      layer_param.mutable_local_weighted_convolution_param();
+  LocalParameter* convolution_param =
+      layer_param.mutable_local_param();
   convolution_param->set_kernel_size(3);
   convolution_param->set_stride(1);
   convolution_param->set_num_output(1);
-  convolution_param->mutable_weight_filler()->set_type("test_local_weight_convolution");
+  convolution_param->mutable_weight_filler()->set_type("test_local");
   convolution_param->mutable_weight_filler()->set_value(1);
   convolution_param->mutable_bias_filler()->set_type("constant");
   convolution_param->mutable_bias_filler()->set_value(0.1);
   shared_ptr<Layer<TypeParam> > layer(
-      new LocalWeightedConvolutionLayer<TypeParam>(layer_param));
+      new LocalLayer<TypeParam>(layer_param));
   layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   Caffe::set_mode(Caffe::CPU);
   layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
@@ -104,24 +104,24 @@ TYPED_TEST(LocalWeightedConvolutionLayerTest, TestCPUSimpleConvolution) {
 }
 
 
-TYPED_TEST(LocalWeightedConvolutionLayerTest, TestGPUSimpleConvolution) {
+TYPED_TEST(LocalLayerTest, TestGPUSimpleConvolution) {
   // We will simply see if the convolution layer carries out averaging well.
   FillerParameter filler_param;
   filler_param.set_value(1.);
   ConstantFiller<TypeParam> filler(filler_param);
   filler.Fill(this->blob_bottom_);
   LayerParameter layer_param;
-  LocalWeightedConvolutionParameter* convolution_param =
-      layer_param.mutable_local_weighted_convolution_param();
+  LocalParameter* convolution_param =
+      layer_param.mutable_local_param();
   convolution_param->set_kernel_size(3);
   convolution_param->set_stride(2);
   convolution_param->set_num_output(4);
-  convolution_param->mutable_weight_filler()->set_type("test_local_weight_convolution");
+  convolution_param->mutable_weight_filler()->set_type("test_local");
   convolution_param->mutable_weight_filler()->set_value(1);
   convolution_param->mutable_bias_filler()->set_type("constant");
   convolution_param->mutable_bias_filler()->set_value(0.1);
   shared_ptr<Layer<TypeParam> > layer(
-      new LocalWeightedConvolutionLayer<TypeParam>(layer_param));
+      new LocalLayer<TypeParam>(layer_param));
   layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   Caffe::set_mode(Caffe::GPU);
   layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
@@ -139,33 +139,33 @@ TYPED_TEST(LocalWeightedConvolutionLayerTest, TestGPUSimpleConvolution) {
   }
 }
 
-TYPED_TEST(LocalWeightedConvolutionLayerTest, TestCPUGradient) {
+TYPED_TEST(LocalLayerTest, TestCPUGradient) {
   LayerParameter layer_param;
-  LocalWeightedConvolutionParameter* convolution_param =
-      layer_param.mutable_local_weighted_convolution_param();
+  LocalParameter* convolution_param =
+      layer_param.mutable_local_param();
   convolution_param->set_kernel_size(3);
   convolution_param->set_stride(2);
   convolution_param->set_num_output(2);
   convolution_param->mutable_weight_filler()->set_type("gaussian");
   convolution_param->mutable_bias_filler()->set_type("gaussian");
   Caffe::set_mode(Caffe::CPU);
-  LocalWeightedConvolutionLayer<TypeParam> layer(layer_param);
+  LocalLayer<TypeParam> layer(layer_param);
   GradientChecker<TypeParam> checker(1e-2, 1e-3);
   checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
       this->blob_top_vec_);
 }
 
-TYPED_TEST(LocalWeightedConvolutionLayerTest, TestGPUGradient) {
+TYPED_TEST(LocalLayerTest, TestGPUGradient) {
   LayerParameter layer_param;
-  LocalWeightedConvolutionParameter* convolution_param =
-      layer_param.mutable_local_weighted_convolution_param();
+  LocalParameter* convolution_param =
+      layer_param.mutable_local_param();
   convolution_param->set_kernel_size(3);
   convolution_param->set_stride(2);
   convolution_param->set_num_output(2);
   convolution_param->mutable_weight_filler()->set_type("gaussian");
   convolution_param->mutable_bias_filler()->set_type("gaussian");
   Caffe::set_mode(Caffe::GPU);
-  LocalWeightedConvolutionLayer<TypeParam> layer(layer_param);
+  LocalLayer<TypeParam> layer(layer_param);
   GradientChecker<TypeParam> checker(1e-2, 1e-3);
   checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
       this->blob_top_vec_);

From fc6c6aeb86531b6bb4e40a8c21ec2924e7a68355 Mon Sep 17 00:00:00 2001
From: Jack Culpepper <jackcul@yahoo-inc.com>
Date: Fri, 17 Oct 2014 21:54:49 +0000
Subject: [PATCH 07/10] cleanup: merge gradient kernels into layer's .cu file,
 remove debugging cpu impl

---
 include/caffe/util/local_update.hpp | 26 ---------
 src/caffe/layers/local_layer.cu     | 75 ++++++++++++++++++++++++-
 src/caffe/util/local_update.cpp     | 56 -------------------
 src/caffe/util/local_update.cu      | 85 -----------------------------
 4 files changed, 74 insertions(+), 168 deletions(-)
 delete mode 100644 include/caffe/util/local_update.hpp
 delete mode 100644 src/caffe/util/local_update.cpp
 delete mode 100644 src/caffe/util/local_update.cu

diff --git a/include/caffe/util/local_update.hpp b/include/caffe/util/local_update.hpp
deleted file mode 100644
index ce9d1fd3b09..00000000000
--- a/include/caffe/util/local_update.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2014 BVLC and contributors.
-
-#ifndef _CAFFE_UTIL_LOCAL_UPDATE_HPP_
-#define _CAFFE_UTIL_LOCAL_UPDATE_HPP_
-
-namespace caffe {
-
-template <typename Dtype>
-void local_update1_cpu(const Dtype* data_A, const Dtype* data_B, Dtype* data_R, const int filter_num,
-	    const int location_num, const int output_num);
-
-template <typename Dtype>
-void local_update1_gpu(const Dtype* data_A, const Dtype* data_B, Dtype* data_R, const int filter_num,
-	    const int location_num, const int output_num);
-
-template <typename Dtype>
-void local_update2_cpu(const Dtype* data_A, const Dtype* data_B, Dtype* data_R, const int filter_num,
-	    const int location_num, const int output_num);
-
-template <typename Dtype>
-void local_update2_gpu(const Dtype* data_A, const Dtype* data_B, Dtype* data_R, const int filter_num,
-	    const int location_num, const int output_num);
-
-}  // namespace caffe
-
-#endif  // _CAFFE_UTIL_LOCAL_UPDATE_HPP_
diff --git a/src/caffe/layers/local_layer.cu b/src/caffe/layers/local_layer.cu
index d2536424480..1abd9ed4ac7 100644
--- a/src/caffe/layers/local_layer.cu
+++ b/src/caffe/layers/local_layer.cu
@@ -3,7 +3,6 @@
 #include "caffe/filler.hpp"
 #include "caffe/layer.hpp"
 #include "caffe/util/im2col.hpp"
-#include "caffe/util/local_update.hpp"
 #include "caffe/util/math_functions.hpp"
 #include "caffe/vision_layers.hpp"
 
@@ -101,6 +100,80 @@ void LocalLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+
+template <typename Dtype>
+__global__ void local_update1_gpu_kernel(const Dtype* data_A, const Dtype* data_B,
+                                    Dtype* data_R, const int filter_num,
+                                    const int location_num, const int output_num) {
+  int total = filter_num * location_num * output_num;  // loop bound passed to CUDA_KERNEL_LOOP (macro defined outside this file)
+  CUDA_KERNEL_LOOP(index, total) {  // per index: R[q][n][p] += A[q][p] * B[n][p]
+    int p = index % location_num;  // index == (q * filter_num + n) * location_num + p
+    int n = (index / location_num) % filter_num;  // middle dimension of data_R
+    int q = (index / location_num) / filter_num;  // outermost dimension of data_R
+    data_R[index] += data_A[q*location_num+p] * data_B[n*location_num+p];
+  }
+}
+
+template <typename Dtype>
+void local_update1_gpu(const Dtype* data_A, const Dtype* data_B,
+                       Dtype* data_R, const int filter_num,
+                       const int location_num, const int output_num) {
+  // Launches local_update1_gpu_kernel to accumulate R[q][n][p] += A[q][p] * B[n][p] into data_R,
+  // where data_A is output_num x location_num, data_B is filter_num x location_num
+  // and data_R is output_num x filter_num x location_num.
+  // NOTE(review): defined after Backward_gpu and local_update.hpp is no longer included here; confirm callers see a declaration.
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  local_update1_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(filter_num * location_num * output_num),
+                             CAFFE_CUDA_NUM_THREADS>>>(data_A, data_B, data_R, filter_num, location_num, output_num);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+// Explicit instantiations of local_update1_gpu for float and double.
+template void local_update1_gpu<float>(const float* data_A, const float* data_B,
+                                float* data_R, const int filter_num,
+                                const int location_num, const int output_num);
+template void local_update1_gpu<double>(const double* data_A, const double* data_B,
+                                double* data_R, const int filter_num,
+                                const int location_num, const int output_num);
+
+
+template <typename Dtype>
+__global__ void local_update2_gpu_kernel(const Dtype* data_A, const Dtype* data_B,
+                                Dtype* data_R, const int filter_num,
+                                const int location_num, const int output_num) {
+  int total = filter_num * location_num;  // loop bound passed to CUDA_KERNEL_LOOP (macro defined outside this file)
+  CUDA_KERNEL_LOOP(index, total) {  // per index: R[n][p] += sum over q of A[q][p] * B[q][n][p]
+    int p = index % location_num;  // index == n * location_num + p
+    int n = (index / location_num);  // outer dimension of data_R
+    for (int q=0; q<output_num; q++) {  // NOTE(review): data_R[index] is read-modify-written on every q; a local accumulator would store once
+      data_R[index] += data_A[q*location_num+p] * data_B[(q*filter_num+n)*location_num+p];
+    }
+  }
+}
+
+template <typename Dtype>
+void local_update2_gpu(const Dtype* data_A, const Dtype* data_B,
+                       Dtype* data_R, const int filter_num,
+                       const int location_num, const int output_num) {
+  // Launches local_update2_gpu_kernel to accumulate R[n][p] += \sum_q(A[q][p] * B[q][n][p]) into data_R,
+  // where data_A is output_num x location_num, data_B is output_num x filter_num x location_num
+  // and data_R is filter_num x location_num.
+  // NOTE(review): defined after Backward_gpu and local_update.hpp is no longer included here; confirm callers see a declaration.
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  local_update2_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(filter_num * location_num),
+                             CAFFE_CUDA_NUM_THREADS>>>(data_A, data_B, data_R, filter_num, location_num, output_num);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+// Explicit instantiations of local_update2_gpu for float and double.
+template void local_update2_gpu<float>(const float* data_A, const float* data_B,
+                       float* data_R, const int filter_num,
+                       const int location_num, const int output_num);
+template void local_update2_gpu<double>(const double* data_A, const double* data_B,
+                       double* data_R, const int filter_num,
+                       const int location_num, const int output_num);
+
+
 INSTANTIATE_LAYER_GPU_FUNCS(LocalLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/util/local_update.cpp b/src/caffe/util/local_update.cpp
deleted file mode 100644
index 69e605e1110..00000000000
--- a/src/caffe/util/local_update.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright 2014 BVLC and contributors.
-
-#include <cmath>
-#include <cstdlib>
-#include <cstring>
-
-#include "caffe/util/local_update.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void local_update1_cpu(const Dtype* data_A, const Dtype* data_B, Dtype* data_R, const int filter_num,
-                       const int location_num, const int output_num) {
-  int total = filter_num * location_num * output_num;
-  for (int index=0; index<total; index++) {
-    int p = index % location_num;
-    int n = (index / location_num);
-    data_R[index] = Dtype(0);
-
-    for (int q=0; q<output_num; q++) {
-      data_R[index] += data_A[q*location_num+p] * data_B[(q*filter_num+n)*location_num+p];
-    }
-  }
-}
-
-// Explicit instantiation
-template void local_update1_cpu<float>(const float* data_A, const float* data_B,
-                                   float* data_R, const int filter_num,
-                                   const int location_num, const int output_num);
-template void local_update1_cpu<double>(const double* data_A, const double* data_B,
-                                   double* data_R, const int filter_num,
-                                   const int location_num, const int output_num);
-
-template <typename Dtype>
-void local_update2_cpu(const Dtype* data_A, const Dtype* data_B,
-                       Dtype* data_R, const int filter_num,
-                       const int location_num, const int output_num) {
-  int total = filter_num * location_num ;
-  for (int index=0; index<total; index++) {
-    int p = index % location_num;
-    int n = (index / location_num);
-    for (int q=0; q<output_num; q++) {
-      data_R[index] += data_A[q*location_num+p] * data_B[(q*filter_num+n)*location_num+p];
-    }
-  }
-}
-
-// Explicit instantiation
-template void local_update2_cpu<float>(const float* data_A, const float* data_B,
-                                   float* data_R, const int filter_num,
-                                   const int location_num, const int output_num);
-template void local_update2_cpu<double>(const double* data_A, const double* data_B,
-                                   double* data_R, const int filter_num,
-                                   const int location_num, const int output_num);
-
-}  // namespace caffe
diff --git a/src/caffe/util/local_update.cu b/src/caffe/util/local_update.cu
deleted file mode 100644
index f74d9bbea37..00000000000
--- a/src/caffe/util/local_update.cu
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright 2014 BVLC and contributors.
-
-#include <algorithm>
-#include <cmath>
-#include <cstdlib>
-#include <cstring>
-
-#include "caffe/common.hpp"
-#include "caffe/util/local_update.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void local_update1_gpu_kernel(const Dtype* data_A, const Dtype* data_B,
-                                    Dtype* data_R, const int filter_num,
-                                    const int location_num, const int output_num) {
-  int total = filter_num * location_num * output_num;
-  CUDA_KERNEL_LOOP(index, total) {
-    int p = index % location_num;
-    int n = (index / location_num) % filter_num;
-    int q = (index / location_num) / filter_num;
-    data_R[index] += data_A[q*location_num+p] * data_B[n*location_num+p];
-  }
-}
-
-template <typename Dtype>
-void local_update1_gpu(const Dtype* data_A, const Dtype* data_B,
-                       Dtype* data_R, const int filter_num,
-                       const int location_num, const int output_num) {
-  // data_A is output_num x location_num
-  // data_B is filter_num x location_num
-  // data_R is output_num x filter_num x location_num, the update performed is Rqnp += Aqp * Bnp
-
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  local_update1_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(filter_num * location_num * output_num),
-                             CAFFE_CUDA_NUM_THREADS>>>(data_A, data_B, data_R, filter_num, location_num, output_num);
-  CUDA_POST_KERNEL_CHECK;
-}
-
-// Explicit instantiation
-template void local_update1_gpu<float>(const float* data_A, const float* data_B,
-                                float* data_R, const int filter_num,
-                                const int location_num, const int output_num);
-template void local_update1_gpu<double>(const double* data_A, const double* data_B,
-                                double* data_R, const int filter_num,
-                                const int location_num, const int output_num);
-
-
-template <typename Dtype>
-__global__ void local_update2_gpu_kernel(const Dtype* data_A, const Dtype* data_B,
-                                Dtype* data_R, const int filter_num,
-                                const int location_num, const int output_num) {
-  int total = filter_num * location_num;
-  CUDA_KERNEL_LOOP(index, total) {
-    int p = index % location_num;
-    int n = (index / location_num);
-    for (int q=0; q<output_num; q++) {
-      data_R[index] += data_A[q*location_num+p] * data_B[(q*filter_num+n)*location_num+p];
-    }
-  }
-}
-
-template <typename Dtype>
-void local_update2_gpu(const Dtype* data_A, const Dtype* data_B,
-                       Dtype* data_R, const int filter_num,
-                       const int location_num, const int output_num) {
-  // data_A is output_num x location_num
-  // data_B is output_num x filter_num x location_num
-  // data_R is filter_num x location_num, the update performed is Rnp += \sum_q(Aqp * Bqnp)
-
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  local_update2_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(filter_num * location_num),
-                             CAFFE_CUDA_NUM_THREADS>>>(data_A, data_B, data_R, filter_num, location_num, output_num);
-  CUDA_POST_KERNEL_CHECK;
-}
-
-// Explicit instantiation
-template void local_update2_gpu<float>(const float* data_A, const float* data_B,
-                       float* data_R, const int filter_num,
-                       const int location_num, const int output_num);
-template void local_update2_gpu<double>(const double* data_A, const double* data_B,
-                       double* data_R, const int filter_num,
-                       const int location_num, const int output_num);
-
-}  // namespace caffe

From 215f76cbfb84cea6d0578590c4b34d5b256acaf2 Mon Sep 17 00:00:00 2001
From: Jack Culpepper <jackcul@yahoo-inc.com>
Date: Fri, 17 Oct 2014 22:01:33 +0000
Subject: [PATCH 08/10] finish renaming

---
 examples/mnist/lenet_local_train_test.prototxt   |  8 ++++----
 .../mnist_siamese_local_train_test.prototxt      | 16 ++++++++--------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/examples/mnist/lenet_local_train_test.prototxt b/examples/mnist/lenet_local_train_test.prototxt
index 149cd8a53a9..ff88b5f4919 100644
--- a/examples/mnist/lenet_local_train_test.prototxt
+++ b/examples/mnist/lenet_local_train_test.prototxt
@@ -62,12 +62,12 @@ layers {
 }
 layers {
   name: "local1"
-  type: LOCAL_WEIGHTED_CONVOLUTION
+  type: LOCAL
   bottom: "pool1"
   top: "local1"
   blobs_lr: 1
   blobs_lr: 1
-  local_weighted_convolution_param {
+  local_param {
     num_output: 5
     kernel_size: 5
     stride: 1
@@ -89,12 +89,12 @@ layers {
 }
 layers {
   name: "local2"
-  type: LOCAL_WEIGHTED_CONVOLUTION
+  type: LOCAL
   bottom: "local1"
   top: "local2"
   blobs_lr: 1
   blobs_lr: 1
-  local_weighted_convolution_param {
+  local_param {
     num_output: 10
     kernel_size: 5
     stride: 1
diff --git a/examples/siamese/mnist_siamese_local_train_test.prototxt b/examples/siamese/mnist_siamese_local_train_test.prototxt
index 37df99abcbe..4212217df15 100644
--- a/examples/siamese/mnist_siamese_local_train_test.prototxt
+++ b/examples/siamese/mnist_siamese_local_train_test.prototxt
@@ -74,12 +74,12 @@ layers {
 
 layers {
   name: "local1"
-  type: LOCAL_WEIGHTED_CONVOLUTION
+  type: LOCAL
   bottom: "pool1"
   top: "local1"
   blobs_lr: 1
   blobs_lr: 2
-  local_weighted_convolution_param {
+  local_param {
     num_output: 5
     kernel_size: 5
     stride: 1
@@ -103,12 +103,12 @@ layers {
 }
 layers {
   name: "local2"
-  type: LOCAL_WEIGHTED_CONVOLUTION
+  type: LOCAL
   bottom: "local1"
   top: "local2"
   blobs_lr: 1
   blobs_lr: 2
-  local_weighted_convolution_param {
+  local_param {
     num_output: 10
     kernel_size: 5
     stride: 1
@@ -260,12 +260,12 @@ layers {
 
 layers {
   name: "local1_p"
-  type: LOCAL_WEIGHTED_CONVOLUTION
+  type: LOCAL
   bottom: "pool1_p"
   top: "local1_p"
   blobs_lr: 1
   blobs_lr: 2
-  local_weighted_convolution_param {
+  local_param {
     num_output: 5
     kernel_size: 5
     stride: 1
@@ -289,12 +289,12 @@ layers {
 }
 layers {
   name: "local2_p"
-  type: LOCAL_WEIGHTED_CONVOLUTION
+  type: LOCAL
   bottom: "local1_p"
   top: "local2_p"
   blobs_lr: 1
   blobs_lr: 2
-  local_weighted_convolution_param {
+  local_param {
     num_output: 10
     kernel_size: 5
     stride: 1

From 2d8e0c9d17bc27a12dc603cb73c517753cc36beb Mon Sep 17 00:00:00 2001
From: Jack Culpepper <jackculpepper@gmail.com>
Date: Sat, 18 Oct 2014 14:31:49 -0700
Subject: [PATCH 09/10] merge import layer into local branch

---
 .../mnist/lenet_import_conv_pool.prototxt     |  30 +++++
 examples/mnist/lenet_import_solver.prototxt   |  25 +++++
 .../mnist/lenet_import_train_test.prototxt    | 104 ++++++++++++++++++
 examples/mnist/train_lenet_import.sh          |   3 +
 include/caffe/net.hpp                         |   7 ++
 include/caffe/util/io.hpp                     |   2 +
 src/caffe/net.cpp                             |  71 +++++++++++-
 src/caffe/proto/caffe.proto                   |  18 ++-
 src/caffe/test/test_data/module.prototxt      |  21 ++++
 src/caffe/test/test_imports.cpp               |  87 +++++++++++++++
 src/caffe/util/io.cpp                         |  12 ++
 11 files changed, 378 insertions(+), 2 deletions(-)
 create mode 100644 examples/mnist/lenet_import_conv_pool.prototxt
 create mode 100644 examples/mnist/lenet_import_solver.prototxt
 create mode 100644 examples/mnist/lenet_import_train_test.prototxt
 create mode 100755 examples/mnist/train_lenet_import.sh
 create mode 100644 src/caffe/test/test_data/module.prototxt
 create mode 100644 src/caffe/test/test_imports.cpp

diff --git a/examples/mnist/lenet_import_conv_pool.prototxt b/examples/mnist/lenet_import_conv_pool.prototxt
new file mode 100644
index 00000000000..5e2b7886e22
--- /dev/null
+++ b/examples/mnist/lenet_import_conv_pool.prototxt
@@ -0,0 +1,30 @@
+layers {
+  name: "conv"
+  type: CONVOLUTION
+  bottom: "${bottom}"
+  top: "conv"
+  blobs_lr: 1
+  blobs_lr: 2
+  convolution_param {
+    num_output: ${num_output}
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layers {
+  name: "pool"
+  type: POOLING
+  bottom: "conv"
+  top: "pool"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
diff --git a/examples/mnist/lenet_import_solver.prototxt b/examples/mnist/lenet_import_solver.prototxt
new file mode 100644
index 00000000000..c332567f37b
--- /dev/null
+++ b/examples/mnist/lenet_import_solver.prototxt
@@ -0,0 +1,25 @@
+# The train/test net protocol buffer definition
+net: "examples/mnist/lenet_import_train_test.prototxt"
+# test_iter specifies how many forward passes the test should carry out.
+# In the case of MNIST, we have test batch size 100 and 100 test iterations,
+# covering the full 10,000 testing images.
+test_iter: 100
+# Carry out testing every 500 training iterations.
+test_interval: 500
+# The base learning rate, momentum and the weight decay of the network.
+base_lr: 0.01
+momentum: 0.9
+weight_decay: 0.0005
+# The learning rate policy
+lr_policy: "inv"
+gamma: 0.0001
+power: 0.75
+# Display every 100 iterations
+display: 100
+# The maximum number of iterations
+max_iter: 10000
+# snapshot intermediate results
+snapshot: 5000
+snapshot_prefix: "examples/mnist/lenet"
+# solver mode: CPU or GPU
+solver_mode: GPU
diff --git a/examples/mnist/lenet_import_train_test.prototxt b/examples/mnist/lenet_import_train_test.prototxt
new file mode 100644
index 00000000000..4ab86e0dc49
--- /dev/null
+++ b/examples/mnist/lenet_import_train_test.prototxt
@@ -0,0 +1,104 @@
+name: "LeNet"
+layers {
+  name: "mnist"
+  type: DATA
+  top: "data"
+  top: "label"
+  data_param {
+    source: "examples/mnist/mnist_train_lmdb"
+    backend: LMDB
+    batch_size: 64
+  }
+  transform_param {
+    scale: 0.00390625
+  }
+  include: { phase: TRAIN }
+}
+layers {
+  name: "mnist"
+  type: DATA
+  top: "data"
+  top: "label"
+  data_param {
+    source: "examples/mnist/mnist_test_lmdb"
+    backend: LMDB
+    batch_size: 100
+  }
+  transform_param {
+    scale: 0.00390625
+  }
+  include: { phase: TEST }
+}
+layers {
+  name: "cp1"
+  type: IMPORT
+  import_param {
+    net: "examples/mnist/lenet_import_conv_pool.prototxt"
+    var { name: "bottom" value: "/data" }
+    var { name: "num_output" value: "20" }
+  }
+}
+layers {
+  name: "cp2"
+  type: IMPORT
+  import_param {
+    net: "examples/mnist/lenet_import_conv_pool.prototxt"
+    var { name: "bottom" value: "../cp1/pool" }
+    var { name: "num_output" value: "50" }
+  }
+}
+layers {
+  name: "ip1"
+  type: INNER_PRODUCT
+  bottom: "cp2/pool"
+  top: "ip1"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 500
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layers {
+  name: "relu1"
+  type: RELU
+  bottom: "ip1"
+  top: "ip1"
+}
+layers {
+  name: "ip2"
+  type: INNER_PRODUCT
+  bottom: "ip1"
+  top: "ip2"
+  blobs_lr: 1
+  blobs_lr: 2
+  inner_product_param {
+    num_output: 10
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layers {
+  name: "accuracy"
+  type: ACCURACY
+  bottom: "ip2"
+  bottom: "label"
+  top: "accuracy"
+  include: { phase: TEST }
+}
+layers {
+  name: "loss"
+  type: SOFTMAX_LOSS
+  bottom: "ip2"
+  bottom: "label"
+  top: "loss"
+}
diff --git a/examples/mnist/train_lenet_import.sh b/examples/mnist/train_lenet_import.sh
new file mode 100755
index 00000000000..6387228d368
--- /dev/null
+++ b/examples/mnist/train_lenet_import.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env sh
+# Train the import-based LeNet on GPU 1; glog files go to examples/mnist/.
+GLOG_logtostderr=0 GLOG_log_dir=examples/mnist/ ./build/tools/caffe train --solver=examples/mnist/lenet_import_solver.prototxt --gpu=1
diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
index 1d06dc45533..a5229f1df34 100644
--- a/include/caffe/net.hpp
+++ b/include/caffe/net.hpp
@@ -182,6 +182,13 @@ class Net {
   /// @brief Get misc parameters, e.g. the LR multiplier and weight decay.
   void GetLearningRateAndWeightDecay();
 
+  /// @brief Copies source into target with every IMPORT layer expanded.
+  static void LoadImports(const NetParameter& source, NetParameter* target);
+  static void LoadImports(const NetParameter& source, NetParameter* target,
+      const string& pwd);
+  /// @brief Resolves a layer or blob name relative to pwd, e.g. "../data"
+  static string ResolveImportName(const string& path, const string& pwd);
+
   /// @brief Individual layers in the net
   vector<shared_ptr<Layer<Dtype> > > layers_;
   vector<string> layer_names_;
diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp
index e518979a75b..9ca84d1582d 100644
--- a/include/caffe/util/io.hpp
+++ b/include/caffe/util/io.hpp
@@ -52,6 +52,8 @@ inline void MakeTempDir(string* temp_dirname) {
   delete temp_dirname_cstr;
 }
 
+string ReadFile(const string& filename);
+
 bool ReadProtoFromTextFile(const char* filename, Message* proto);
 
 inline bool ReadProtoFromTextFile(const string& filename, Message* proto) {
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 21ab15fd31b..b011676f979 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -1,3 +1,6 @@
+#include <boost/algorithm/string.hpp>
+#include <google/protobuf/text_format.h>
+
 #include <algorithm>
 #include <map>
 #include <set>
@@ -16,6 +19,8 @@
 
 #include "caffe/test/test_caffe_main.hpp"
 
+using boost::replace_all;
+
 namespace caffe {
 
 template <typename Dtype>
@@ -32,10 +37,14 @@ Net<Dtype>::Net(const string& param_file) {
 
 template <typename Dtype>
 void Net<Dtype>::Init(const NetParameter& in_param) {
+  // Load import layers
+  NetParameter expanded(in_param);
+  LoadImports(in_param, &expanded);
+
   // Filter layers based on their include/exclude rules and
   // the current NetState.
   NetParameter filtered_param;
-  FilterNet(in_param, &filtered_param);
+  FilterNet(expanded, &filtered_param);
   LOG(INFO) << "Initializing net from parameters: " << std::endl
             << filtered_param.DebugString();
   // Create a copy of filtered_param with splits added where necessary.
@@ -462,6 +471,66 @@ void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
   }
 }
 
+
+template <typename Dtype>
+void Net<Dtype>::LoadImports(const NetParameter& source, NetParameter* target) {
+  *target = source;  // keep every net-level field of the source...
+  target->mutable_layers()->Clear();  // ...but rebuild the layer list
+  LoadImports(source, target, string());  // empty pwd: root name scope
+}
+
+template <typename Dtype>
+void Net<Dtype>::LoadImports(const NetParameter& source, NetParameter* target,
+    const string& pwd) {
+  // Appends source's layers to target, inlining IMPORT layers recursively.
+  for (int i = 0; i < source.layers_size(); ++i) {
+    const LayerParameter& layer = source.layers(i);
+    if (layer.type() != LayerParameter_LayerType_IMPORT) {
+      LayerParameter* copy = target->add_layers();
+      copy->CopyFrom(layer);
+      copy->set_name(ResolveImportName(copy->name(), pwd));
+      for (int j = 0; j < layer.top_size(); ++j)
+        copy->set_top(j, ResolveImportName(layer.top(j), pwd));
+      for (int j = 0; j < layer.bottom_size(); ++j)
+        copy->set_bottom(j, ResolveImportName(layer.bottom(j), pwd));
+    } else {
+      CHECK(layer.has_import_param()) << "Missing import_param";
+      const ImportParameter& spec = layer.import_param();
+      string proto = ReadFile(spec.net());
+      // Substitute each ${name} placeholder with its configured value
+      for (int j = 0; j < spec.var_size(); ++j)
+        replace_all(proto, "${" + spec.var(j).name() + "}",
+                    spec.var(j).value());
+      NetParameter net;
+      bool parse = google::protobuf::TextFormat::ParseFromString(proto, &net);
+      CHECK(parse) << "Failed to parse NetParameter file: " << spec.net();
+      CHECK(layer.has_name() && layer.name().length() > 0)
+          << "Import layer must have a name";
+      LoadImports(net, target, ResolveImportName(layer.name(), pwd));
+    }
+  }
+}
+
+template <typename Dtype>
+string Net<Dtype>::ResolveImportName(const string& path, const string& pwd) {
+  CHECK(!boost::starts_with(pwd, "/") && !boost::ends_with(pwd, "/"));
+  // A leading '/' marks an absolute name: drop it and ignore pwd.
+  if (boost::starts_with(path, "/"))
+    return path.substr(1);
+  string name = path;
+  string scope = pwd;
+  // Each leading "../" strips the innermost component of the scope.
+  while (boost::starts_with(name, "../")) {
+    name.erase(0, 3);
+    const size_t slash = scope.rfind('/');
+    scope = slash == string::npos ? "" : scope.substr(0, slash);
+  }
+  if (scope.empty())
+    return name;
+  return (name.empty() || name == ".") ? scope : scope + '/' + name;
+}
+
+
 template <typename Dtype>
 void Net<Dtype>::GetLearningRateAndWeightDecay() {
   LOG(INFO) << "Collecting Learning Rate and Weight Decay.";
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 8cc18a5fd20..f5bd5ca6928 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -246,9 +246,10 @@ message LayerParameter {
     HINGE_LOSS = 28;
     IM2COL = 11;
     IMAGE_DATA = 12;
+    IMPORT = 51;
     INFOGAIN_LOSS = 13;
     INNER_PRODUCT = 14;
-    LOCAL = 39;
+    LOCAL = 50;
     LRN = 15;
     MEMORY_DATA = 29;
     MULTINOMIAL_LOGISTIC_LOSS = 16;
@@ -308,6 +309,7 @@ message LayerParameter {
   optional HDF5OutputParameter hdf5_output_param = 14;
   optional HingeLossParameter hinge_loss_param = 29;
   optional ImageDataParameter image_data_param = 15;
+  optional ImportParameter import_param = 103;
   optional InfogainLossParameter infogain_loss_param = 16;
   optional InnerProductParameter inner_product_param = 17;
   optional LocalParameter local_param = 102;
@@ -553,6 +555,20 @@ message ImageDataParameter {
   optional bool mirror = 6 [default = false];
 }
 
+message Pair {
+  required string name = 1;   // key, e.g. a variable name in ImportParameter
+  required string value = 2;  // text associated with the key
+}
+
+// Message that stores parameters used by layers of type IMPORT
+message ImportParameter {
+  // Text-format net definition file whose layers are imported
+  required string net = 1;
+  // Variable names to replace before importing the file. Variables can
+  // be used in the file in this format: ${name}
+  repeated Pair var = 2;
+}
+
 // Message that stores parameters InfogainLossLayer
 message InfogainLossParameter {
   // Specify the infogain matrix source.
diff --git a/src/caffe/test/test_data/module.prototxt b/src/caffe/test/test_data/module.prototxt
new file mode 100644
index 00000000000..6c2d5359360
--- /dev/null
+++ b/src/caffe/test/test_data/module.prototxt
@@ -0,0 +1,21 @@
+layers: {
+  name: 'innerproduct'
+  type: INNER_PRODUCT
+  inner_product_param {
+    num_output: ${num_output}
+    weight_filler {
+      type: 'gaussian'
+      std: 0.01
+    }
+    bias_filler {
+      type: 'constant'
+      value: 0
+    }
+  }
+  blobs_lr: 1.
+  blobs_lr: 2.
+  weight_decay: 1.
+  weight_decay: 0.
+  bottom: '../data'
+  top: 'innerproduct'
+}
diff --git a/src/caffe/test/test_imports.cpp b/src/caffe/test/test_imports.cpp
new file mode 100644
index 00000000000..023ae6dce7f
--- /dev/null
+++ b/src/caffe/test/test_imports.cpp
@@ -0,0 +1,87 @@
+#include <map>
+#include <string>
+#include <vector>
+
+#include "google/protobuf/text_format.h"
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/net.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/util/io.hpp"
+#include "caffe/vision_layers.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+
+namespace caffe {
+
+template <typename TypeParam>
+class ImportsTest : public MultiDeviceTest<TypeParam> {  // IMPORT fixture
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  virtual void InitNetFromProtoString(const string& proto) {  // parse & build
+    NetParameter param;
+    CHECK(google::protobuf::TextFormat::ParseFromString(proto, &param));
+    net_.reset(new Net<Dtype>(param));
+  }
+
+  virtual void InitNet() {  // data -> IMPORT(module.prototxt) -> loss
+    string file = CMAKE_SOURCE_DIR "caffe/test/test_data/module.prototxt";
+    string proto =
+        "name: 'TestNetwork' "
+        "layers: { "
+        "  name: 'data' "
+        "  type: DUMMY_DATA "
+        "  dummy_data_param { "
+        "    num: 5 "
+        "    channels: 2 "
+        "    height: 3 "
+        "    width: 4 "
+        "    num: 5 "
+        "    channels: 1 "
+        "    height: 1 "
+        "    width: 1 "
+        "    data_filler { "
+        "      type: 'gaussian' "
+        "      std: 0.01 "
+        "    } "
+        "  } "
+        "  top: 'data' "
+        "  top: 'label' "
+        "} "
+        "layers: { "
+        "  name: 'import' "
+        "  type: IMPORT "
+        "  import_param { "
+        "    net: '" + file + "' "
+        "    var { name: 'num_output' value: '1000' } "
+        "  } "
+        "} "
+        "layers: { "
+        "  name: 'loss' "
+        "  type: SOFTMAX_LOSS "
+        "  bottom: 'import/innerproduct' "
+        "  bottom: 'label' "
+        "  top: 'top_loss' "
+        "} ";
+    InitNetFromProtoString(proto);
+  }
+
+  shared_ptr<Net<Dtype> > net_;  // set by InitNetFromProtoString()
+};
+
+TYPED_TEST_CASE(ImportsTest, TestDtypesAndDevices);
+
+TYPED_TEST(ImportsTest, ConvPool) {  // imported blobs get an "import/" prefix
+  this->InitNet();
+  EXPECT_TRUE(this->net_->has_blob("data"));
+  EXPECT_TRUE(this->net_->has_blob("label"));
+  EXPECT_TRUE(this->net_->has_blob("import/innerproduct"));  // scoped name
+  EXPECT_FALSE(this->net_->has_blob("loss"));  // loss layer's top: top_loss
+}
+}  // namespace caffe
+
diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp
index 36510d61d40..09d4472c7bc 100644
--- a/src/caffe/util/io.cpp
+++ b/src/caffe/util/io.cpp
@@ -28,6 +28,18 @@ using google::protobuf::io::ZeroCopyOutputStream;
 using google::protobuf::io::CodedOutputStream;
 using google::protobuf::Message;
 
+// Returns the entire content of filename; CHECK-fails on any I/O error.
+std::string ReadFile(const string& filename) {
+  std::ifstream in(filename.c_str(), std::ios::in | std::ios::binary);
+  CHECK(in) << "Failed to read file: " << filename;
+  const std::streamoff size = in.seekg(0, std::ios::end).tellg();
+  CHECK_GE(size, 0) << "Failed to read file: " << filename;  // -1 on failure
+  std::string contents(static_cast<size_t>(size), '\0');
+  in.seekg(0, std::ios::beg);
+  CHECK(in.read(&contents[0], size)) << "Failed to read file: " << filename;
+  return contents;
+}
+
 bool ReadProtoFromTextFile(const char* filename, Message* proto) {
   int fd = open(filename, O_RDONLY);
   CHECK_NE(fd, -1) << "File not found: " << filename;

From 647f91e8b0cdc2ee5c348c039ee3e268e94d2f1a Mon Sep 17 00:00:00 2001
From: Jack Culpepper <jackculpepper@gmail.com>
Date: Sat, 18 Oct 2014 15:14:23 -0700
Subject: [PATCH 10/10] compute same label from label pair

---
 include/caffe/loss_layers.hpp         | 10 ++++++++++
 src/caffe/layers/hinge_loss_layer.cpp | 24 ++++++++++++++++++++----
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp
index 9fe58cd97bc..7be24fd56e2 100644
--- a/include/caffe/loss_layers.hpp
+++ b/include/caffe/loss_layers.hpp
@@ -355,6 +355,13 @@ class HingeLossLayer : public LossLayer<Dtype> {
     return LayerParameter_LayerType_HINGE_LOSS;
   }
 
+  // HingeLossLayer takes 2-3 bottom Blobs; if there are 3 the second and third
+  // are compared to compute a 0/1 label.  (Otherwise the label comes directly
+  // from the second.)
+  virtual inline int ExactNumBottomBlobs() const { return -1; }
+  virtual inline int MinBottomBlobs() const { return 2; }
+  virtual inline int MaxBottomBlobs() const { return 3; }
+
  protected:
   /// @copydoc HingeLossLayer
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
@@ -389,6 +396,9 @@ class HingeLossLayer : public LossLayer<Dtype> {
    */
   virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+ private:
+  int ComputeLabel(const vector<Blob<Dtype>*>& bottom, int i);
 };
 
 /**
diff --git a/src/caffe/layers/hinge_loss_layer.cpp b/src/caffe/layers/hinge_loss_layer.cpp
index f09916e2556..e7a3d5387c5 100644
--- a/src/caffe/layers/hinge_loss_layer.cpp
+++ b/src/caffe/layers/hinge_loss_layer.cpp
@@ -10,19 +10,34 @@
 
 namespace caffe {
 
+// Returns the label of item i, read or derived from the label bottom(s).
+template <typename Dtype>
+int HingeLossLayer<Dtype>::ComputeLabel(const vector<Blob<Dtype>*>& bottom,
+    int i) {
+  const Dtype first = bottom[1]->cpu_data()[i];
+  if (bottom.size() == 2) {
+    // Two bottoms: bottom[1] directly holds the label.
+    return static_cast<int>(first);
+  }
+  // Otherwise (three bottoms) the label tells whether bottom[1] and
+  // bottom[2] agree: 1 if they are equal (same), 0 if not (not same).
+  return (first == bottom[2]->cpu_data()[i]) ? 1 : 0;
+}
+
 template <typename Dtype>
 void HingeLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
   const Dtype* bottom_data = bottom[0]->cpu_data();
   Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-  const Dtype* label = bottom[1]->cpu_data();
   int num = bottom[0]->num();
   int count = bottom[0]->count();
   int dim = count / num;
+  int label;
 
   caffe_copy(count, bottom_data, bottom_diff);
   for (int i = 0; i < num; ++i) {
-    bottom_diff[i * dim + static_cast<int>(label[i])] *= -1;
+    label = ComputeLabel(bottom, i);
+    bottom_diff[i * dim + label] *= -1;
   }
   for (int i = 0; i < num; ++i) {
     for (int j = 0; j < dim; ++j) {
@@ -52,13 +67,14 @@ void HingeLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
   if (propagate_down[0]) {
     Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    const Dtype* label = bottom[1]->cpu_data();
     int num = bottom[0]->num();
     int count = bottom[0]->count();
     int dim = count / num;
+    int label;
 
     for (int i = 0; i < num; ++i) {
-      bottom_diff[i * dim + static_cast<int>(label[i])] *= -1;
+      label = ComputeLabel(bottom, i);
+      bottom_diff[i * dim + label] *= -1;
     }
 
     const Dtype loss_weight = top[0]->cpu_diff()[0];