diff --git a/CMakeLists.txt b/CMakeLists.txt index 37f937fe489..ef599b68922 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,16 +16,13 @@ include(cmake/ConfigGen.cmake) # ---[ Options caffe_option(CPU_ONLY "Build Caffe without CUDA support" OFF) # TODO: rename to USE_CUDA -caffe_option(USE_CUDNN "Build Caffe with cuDNN library support" ON IF NOT CPU_ONLY) +caffe_option(USE_CUDNN "Build Caffe with cuDNN library support" ON IF NOT CPU_ONLY) caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON) caffe_option(BUILD_python "Build Python wrapper" ON) -set(python_version "2" CACHE STRING "Specify which Python version to use") +set(python_version "2" CACHE STRING "Specify which python version to use") caffe_option(BUILD_matlab "Build Matlab wrapper" OFF IF UNIX OR APPLE) caffe_option(BUILD_docs "Build documentation" ON IF UNIX OR APPLE) -caffe_option(BUILD_python_layer "Build the Caffe Python layer" ON) -caffe_option(USE_LMDB "Build with lmdb" ON) -caffe_option(USE_LEVELDB "Build with levelDB" ON) -caffe_option(USE_OPENCV "Build with OpenCV support" ON) +caffe_option(BUILD_python_layer "Build the Caffe python layer" ON) # ---[ Dependencies include(cmake/Dependencies.cmake) diff --git a/Makefile b/Makefile index 5fb6394e947..287fa4ea52f 100644 --- a/Makefile +++ b/Makefile @@ -169,23 +169,9 @@ ifneq ($(CPU_ONLY), 1) LIBRARY_DIRS += $(CUDA_LIB_DIR) LIBRARIES := cudart cublas curand endif - -LIBRARIES += glog gflags protobuf boost_system m hdf5_hl hdf5 - -# handle IO dependencies -USE_LEVELDB ?= 1 -USE_LMDB ?= 1 -USE_OPENCV ?= 1 - -ifeq ($(USE_LEVELDB), 1) - LIBRARIES += leveldb snappy -endif -ifeq ($(USE_LMDB), 1) - LIBRARIES += lmdb -endif -ifeq ($(USE_OPENCV), 1) - LIBRARIES += opencv_core opencv_highgui opencv_imgproc -endif +LIBRARIES += glog gflags protobuf leveldb snappy \ + lmdb boost_system hdf5_hl hdf5 m \ + opencv_core opencv_highgui opencv_imgproc #opencv_imgcodecs PYTHON_LIBRARIES := boost_python python2.7 WARNINGS := -Wall -Wno-sign-compare @@ -304,17 
+290,6 @@ ifeq ($(USE_CUDNN), 1) COMMON_FLAGS += -DUSE_CUDNN endif -# configure IO libraries -ifeq ($(USE_OPENCV), 1) - COMMON_FLAGS += -DUSE_OPENCV -endif -ifeq ($(USE_LEVELDB), 1) - COMMON_FLAGS += -DUSE_LEVELDB -endif -ifeq ($(USE_LMDB), 1) - COMMON_FLAGS += -DUSE_LMDB -endif - # CPU-only configuration ifeq ($(CPU_ONLY), 1) OBJS := $(PROTO_OBJS) $(CXX_OBJS) @@ -354,9 +329,8 @@ else # OS X packages atlas as the vecLib framework LIBRARIES += cblas # 10.10 has accelerate while 10.9 has veclib - XCODE_CLT_VER := $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables | grep 'version' | sed 's/[^0-9]*\([0-9]\).*/\1/') - XCODE_CLT_GEQ_6 := $(shell [ $(XCODE_CLT_VER) -gt 5 ] && echo 1) - ifeq ($(XCODE_CLT_GEQ_6), 1) + XCODE_CLT_VER := $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables | grep -o 'version: 6') + ifneq (,$(findstring version: 6,$(XCODE_CLT_VER))) BLAS_INCLUDE ?= /System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ LDFLAGS += -framework Accelerate else @@ -498,7 +472,7 @@ runtest: $(TEST_ALL_BIN) pytest: py cd python; python -m unittest discover -s caffe/test - + mattest: mat cd matlab; $(MATLAB_DIR)/bin/matlab -nodisplay -r 'caffe.run_tests(), exit()' diff --git a/Makefile.config.example b/Makefile.config.example index a20bad2f5ce..a873502559f 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -7,11 +7,6 @@ # CPU-only switch (uncomment to build without GPU support). # CPU_ONLY := 1 -# uncomment to disable IO dependencies and corresponding data layers -# USE_LEVELDB := 0 -# USE_LMDB := 0 -# USE_OPENCV := 0 - # To customize your choice of compiler, uncomment and set the following. # N.B. 
the default for Linux is g++ and the default for OSX is clang++ # CUSTOM_CXX := g++ diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake index 8b259965359..566d6ca0aa7 100644 --- a/cmake/ConfigGen.cmake +++ b/cmake/ConfigGen.cmake @@ -56,18 +56,6 @@ function(caffe_generate_export_configs) list(APPEND Caffe_DEFINITIONS -DCPU_ONLY) endif() - if(USE_OPENCV) - list(APPEND Caffe_DEFINITIONS -DUSE_OPENCV) - endif() - - if(USE_LMDB) - list(APPEND Caffe_DEFINITIONS -DUSE_LMDB) - endif() - - if(USE_LEVELDB) - list(APPEND Caffe_DEFINITIONS -DUSE_LEVELDB) - endif() - if(NOT HAVE_CUDNN) set(HAVE_CUDNN FALSE) else() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index d68d7bfba66..7c86dd55a30 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -29,27 +29,19 @@ include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR}) list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES}) # ---[ LMDB -if(USE_LMDB) - find_package(LMDB REQUIRED) - include_directories(SYSTEM ${LMDB_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${LMDB_LIBRARIES}) - add_definitions(-DUSE_LMDB) -endif() +find_package(LMDB REQUIRED) +include_directories(SYSTEM ${LMDB_INCLUDE_DIR}) +list(APPEND Caffe_LINKER_LIBS ${LMDB_LIBRARIES}) # ---[ LevelDB -if(USE_LEVELDB) - find_package(LevelDB REQUIRED) - include_directories(SYSTEM ${LevelDB_INCLUDE}) - list(APPEND Caffe_LINKER_LIBS ${LevelDB_LIBRARIES}) - add_definitions(-DUSE_LEVELDB) -endif() +find_package(LevelDB REQUIRED) +include_directories(SYSTEM ${LevelDB_INCLUDE}) +list(APPEND Caffe_LINKER_LIBS ${LevelDB_LIBRARIES}) # ---[ Snappy -if(USE_LEVELDB) - find_package(Snappy REQUIRED) - include_directories(SYSTEM ${Snappy_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${Snappy_LIBRARIES}) -endif() +find_package(Snappy REQUIRED) +include_directories(SYSTEM ${Snappy_INCLUDE_DIR}) +list(APPEND Caffe_LINKER_LIBS ${Snappy_LIBRARIES}) # ---[ CUDA include(cmake/Cuda.cmake) @@ -65,16 +57,13 @@ if(NOT HAVE_CUDA) endif() # ---[ OpenCV 
-if(USE_OPENCV) - find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) - if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found - find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc) - endif() - include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS}) - list(APPEND Caffe_LINKER_LIBS ${OpenCV_LIBS}) - message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})") - add_definitions(-DUSE_OPENCV) +find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) +if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found + find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc) endif() +include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS ${OpenCV_LIBS}) +message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})") # ---[ BLAS if(NOT APPLE) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 3d12e81a130..e094ac0040e 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -114,9 +114,6 @@ function(caffe_print_configuration_summary) caffe_status(" BUILD_matlab : ${BUILD_matlab}") caffe_status(" BUILD_docs : ${BUILD_docs}") caffe_status(" CPU_ONLY : ${CPU_ONLY}") - caffe_status(" USE_LMDB : ${USE_LMDB}") - caffe_status(" USE_LEVELDB : ${USE_LEVELDB}") - caffe_status(" USE_OPENCV : ${USE_OPENCV}") caffe_status("") caffe_status("Dependencies:") caffe_status(" BLAS : " APPLE THEN "Yes (vecLib)" ELSE "Yes (${BLAS})") @@ -124,16 +121,10 @@ function(caffe_print_configuration_summary) caffe_status(" glog : Yes") caffe_status(" gflags : Yes") caffe_status(" protobuf : " PROTOBUF_FOUND THEN "Yes (ver. ${PROTOBUF_VERSION})" ELSE "No" ) - if(USE_LMDB) - caffe_status(" lmdb : " LMDB_FOUND THEN "Yes (ver. ${LMDB_VERSION})" ELSE "No") - endif() - if(USE_LEVELDB) - caffe_status(" LevelDB : " LEVELDB_FOUND THEN "Yes (ver. ${LEVELDB_VERSION})" ELSE "No") - caffe_status(" Snappy : " SNAPPY_FOUND THEN "Yes (ver. ${Snappy_VERSION})" ELSE "No" ) - endif() - if(USE_OPENCV) - caffe_status(" OpenCV : Yes (ver. 
${OpenCV_VERSION})") - endif() + caffe_status(" lmdb : " LMDB_FOUND THEN "Yes (ver. ${LMDB_VERSION})" ELSE "No") + caffe_status(" Snappy : " SNAPPY_FOUND THEN "Yes (ver. ${Snappy_VERSION})" ELSE "No" ) + caffe_status(" LevelDB : " LEVELDB_FOUND THEN "Yes (ver. ${LEVELDB_VERSION})" ELSE "No") + caffe_status(" OpenCV : Yes (ver. ${OpenCV_VERSION})") caffe_status(" CUDA : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No" ) caffe_status("") if(HAVE_CUDA) @@ -174,3 +165,4 @@ function(caffe_print_configuration_summary) caffe_status(" Install path : ${CMAKE_INSTALL_PREFIX}") caffe_status("") endfunction() + diff --git a/cmake/Templates/CaffeConfig.cmake.in b/cmake/Templates/CaffeConfig.cmake.in index 73f57ac2d74..8f23742e52e 100644 --- a/cmake/Templates/CaffeConfig.cmake.in +++ b/cmake/Templates/CaffeConfig.cmake.in @@ -17,24 +17,22 @@ # Caffe_HAVE_CUDNN - signals about cuDNN support -# OpenCV dependency (optional) +# OpenCV dependency -if(@USE_OPENCV@) - if(NOT OpenCV_FOUND) - set(Caffe_OpenCV_CONFIG_PATH "@OpenCV_CONFIG_PATH@") - if(Caffe_OpenCV_CONFIG_PATH) - get_filename_component(Caffe_OpenCV_CONFIG_PATH ${Caffe_OpenCV_CONFIG_PATH} ABSOLUTE) +if(NOT OpenCV_FOUND) + set(Caffe_OpenCV_CONFIG_PATH "@OpenCV_CONFIG_PATH@") + if(Caffe_OpenCV_CONFIG_PATH) + get_filename_component(Caffe_OpenCV_CONFIG_PATH ${Caffe_OpenCV_CONFIG_PATH} ABSOLUTE) - if(EXISTS ${Caffe_OpenCV_CONFIG_PATH} AND NOT TARGET opencv_core) - message(STATUS "Caffe: using OpenCV config from ${Caffe_OpenCV_CONFIG_PATH}") - include(${Caffe_OpenCV_CONFIG_PATH}/OpenCVModules.cmake) - endif() - - else() - find_package(OpenCV REQUIRED) + if(EXISTS ${Caffe_OpenCV_CONFIG_PATH} AND NOT TARGET opencv_core) + message(STATUS "Caffe: using OpenCV config from ${Caffe_OpenCV_CONFIG_PATH}") + include(${Caffe_OpenCV_CONFIG_PATH}/OpenCVModules.cmake) endif() - unset(Caffe_OpenCV_CONFIG_PATH) + + else() + find_package(OpenCV REQUIRED) endif() + unset(Caffe_OpenCV_CONFIG_PATH) endif() # Compute paths diff --git 
a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in index 9302022d7da..6039e8f6b21 100644 --- a/cmake/Templates/caffe_config.h.in +++ b/cmake/Templates/caffe_config.h.in @@ -30,8 +30,3 @@ /* Matlab */ #cmakedefine HAVE_MATLAB - -/* IO libraries */ -#cmakedefine USE_OPENCV -#cmakedefine USE_LMDB -#cmakedefine USE_LEVELDB diff --git a/docs/installation.md b/docs/installation.md index 89a8c71c71a..d535c6d093d 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -17,19 +17,16 @@ When updating Caffe, it's best to `make clean` before re-compiling. ## Prerequisites -Caffe has several dependencies: +Caffe has several dependencies. * [CUDA](https://developer.nvidia.com/cuda-zone) is required for GPU mode. * library version 7.0 and the latest driver version are recommended, but 6.* is fine too * 5.5, and 5.0 are compatible but considered legacy * [BLAS](http://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) via ATLAS, MKL, or OpenBLAS. * [Boost](http://www.boost.org/) >= 1.55 -* `protobuf`, `glog`, `gflags`, `hdf5` - -Optional dependencies: - * [OpenCV](http://opencv.org/) >= 2.4 including 3.0 -* IO libraries: `lmdb`, `leveldb` (note: leveldb requires `snappy`) +* `protobuf`, `glog`, `gflags` +* IO libraries `hdf5`, `leveldb`, `snappy`, `lmdb` Pycaffe and Matcaffe interfaces have their own natural needs. diff --git a/docs/multigpu.md b/docs/multigpu.md deleted file mode 100644 index 01cfb8938b5..00000000000 --- a/docs/multigpu.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: Multi-GPU Usage, Hardware Configuration Assumptions, and Performance ---- - -# Multi-GPU Usage - -Currently Multi-GPU is only supported via the C/C++ paths and only for training. - -The GPUs to be used for training can be set with the "-gpu" flag on the command line to the 'caffe' tool. e.g. "build/tools/caffe train --solver=models/bvlc_alexnet/solver.prototxt --gpu=0,1" will train on GPUs 0 and 1. 
- -**NOTE**: each GPU runs the batchsize specified in your train_val.prototxt. So if you go from 1 GPU to 2 GPU, your effective batchsize will double. e.g. if your train_val.prototxt specified a batchsize of 256, if you run 2 GPUs your effective batch size is now 512. So you need to adjust the batchsize when running multiple GPUs and/or adjust your solver params, specifically learning rate. - -# Hardware Configuration Assumptions - -The current implementation uses a tree reduction strategy. e.g. if there are 4 GPUs in the system, 0:1, 2:3 will exchange gradients, then 0:2 (top of the tree) will exchange gradients, 0 will calculate -updated model, 0\-\>2, and then 0\-\>1, 2\-\>3. - -For best performance, P2P DMA access between devices is needed. Without P2P access, for example crossing PCIe root complex, data is copied through host and effective exchange bandwidth is greatly reduced. - -Current implementation has a "soft" assumption that the devices being used are homogeneous. In practice, any devices of the same general class should work together, but performance and total size is limited by the smallest device being used. e.g. if you combine a TitanX and a GTX980, peformance will be limited by the 980. Mixing vastly different levels of boards, e.g. Kepler and Fermi, is not supported. - -"nvidia-smi topo -m" will show you the connectivity matrix. You can do P2P through PCIe bridges, but not across socket level links at this time, e.g. across CPU sockets on a multi-socket motherboard. - -# Scaling Performance - -Performance is **heavily** dependent on the PCIe topology of the system, the configuration of the neural network you are training, and the speed of each of the layers. Systems like the DIGITS DevBox have an optimized PCIe topology (X99-E WS chipset). In general, scaling on 2 GPUs tends to be ~1.8X on average for networks like AlexNet, CaffeNet, VGG, GoogleNet. 4 GPUs begins to have falloff in scaling. 
Generally with "weak scaling" where the batchsize increases with the number of GPUs you will see 3.5x scaling or so. With "strong scaling", the system can become communication bound, especially with layer performance optimizations like those in [cuDNNv3](http://nvidia.com/cudnn), and you will likely see closer to mid 2.x scaling in performance. Networks that have heavy computation compared to the number of parameters tend to have the best scaling performance. \ No newline at end of file diff --git a/examples/cpp_classification/classification.cpp b/examples/cpp_classification/classification.cpp index de48fb692c8..dc8b863f53f 100644 --- a/examples/cpp_classification/classification.cpp +++ b/examples/cpp_classification/classification.cpp @@ -1,9 +1,7 @@ #include -#ifdef USE_OPENCV #include #include #include -#endif // USE_OPENCV #include #include #include @@ -11,7 +9,6 @@ #include #include -#ifdef USE_OPENCV using namespace caffe; // NOLINT(build/namespaces) using std::string; @@ -258,8 +255,3 @@ int main(int argc, char** argv) { << p.first << "\"" << std::endl; } } -#else -int main(int argc, char** argv) { - LOG(FATAL) << "This example requires OpenCV; compile with USE_OPENCV."; -} -#endif // USE_OPENCV diff --git a/examples/mnist/convert_mnist_data.cpp b/examples/mnist/convert_mnist_data.cpp index 8f29bafde85..54443f11dd3 100644 --- a/examples/mnist/convert_mnist_data.cpp +++ b/examples/mnist/convert_mnist_data.cpp @@ -9,13 +9,9 @@ #include #include #include - -#if defined(USE_LEVELDB) && defined(USE_LMDB) #include #include #include -#endif - #include #include @@ -24,8 +20,6 @@ #include "caffe/proto/caffe.pb.h" -#if defined(USE_LEVELDB) && defined(USE_LMDB) - using namespace caffe; // NOLINT(build/namespaces) using std::string; @@ -202,9 +196,3 @@ int main(int argc, char** argv) { } return 0; } -#else -int main(int argc, char** argv) { - LOG(FATAL) << "This example requires LevelDB and LMDB; " << - "compile with USE_LEVELDB and USE_LMDB."; -} -#endif // USE_LEVELDB 
and USE_LMDB diff --git a/examples/siamese/convert_mnist_siamese_data.cpp b/examples/siamese/convert_mnist_siamese_data.cpp index ad08036fb08..8008b4439c5 100644 --- a/examples/siamese/convert_mnist_siamese_data.cpp +++ b/examples/siamese/convert_mnist_siamese_data.cpp @@ -10,14 +10,12 @@ #include "glog/logging.h" #include "google/protobuf/text_format.h" +#include "leveldb/db.h" #include "stdint.h" #include "caffe/proto/caffe.pb.h" #include "caffe/util/math_functions.hpp" -#ifdef USE_LEVELDB -#include "leveldb/db.h" - uint32_t swap_endian(uint32_t val) { val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF); return (val << 16) | (val >> 16); @@ -123,8 +121,3 @@ int main(int argc, char** argv) { } return 0; } -#else -int main(int argc, char** argv) { - LOG(FATAL) << "This example requires LevelDB; compile with USE_LEVELDB."; -} -#endif // USE_LEVELDB diff --git a/examples/triplet/3d_triplet.prototxt b/examples/triplet/3d_triplet.prototxt new file mode 100644 index 00000000000..076e0be5040 --- /dev/null +++ b/examples/triplet/3d_triplet.prototxt @@ -0,0 +1,110 @@ +name: "3d_triplet" +input: "data" +input_dim: 9720 +input_dim: 1 +input_dim: 64 +input_dim: 64 +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + convolution_param { + num_output: 16 + kernel_size: 8 + stride: 1 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "pool1" + top: "pool1" +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + convolution_param { + num_output: 7 + kernel_size: 5 + stride: 1 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "relu2" + type: "ReLU" + 
bottom: "pool2" + top: "pool2" +} +layer { + name: "ip1" + type: "InnerProduct" + bottom: "pool2" + top: "ip1" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + inner_product_param { + num_output: 256 + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "ip1" + top: "ip1" +} +layer { + name: "feat" + type: "InnerProduct" + bottom: "ip1" + top: "feat" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + inner_product_param { + num_output: 3 + } +} diff --git a/examples/triplet/3d_triplet_solver.prototxt b/examples/triplet/3d_triplet_solver.prototxt new file mode 100644 index 00000000000..eea97da7603 --- /dev/null +++ b/examples/triplet/3d_triplet_solver.prototxt @@ -0,0 +1,25 @@ +# The train/test net protocol buffer definition +net: "examples/triplet/3d_triplet_train_test.prototxt" +# test_iter specifies how many forward passes the test should carry out. +# In the case of 3d database, we have test batch size 250 and 250 test iterations: 50*(2+3)=250, +# covering the full 9720 testing images:162*6*10=9720. +test_iter: 100 +# Carry out testing every 500 training iterations. +test_interval: 100 +# The base learning rate, momentum and the weight decay of the network. 
+base_lr: 0.001 +momentum: 0.9 +weight_decay: 0.0000 +# The learning rate policy +lr_policy: "inv" +gamma: 0.0001 +power: 0.75 +# Display every 100 iterations +display: 100 +# The maximum number of iterations +max_iter: 80000 +# snapshot intermediate results +snapshot: 5000 +snapshot_prefix: "examples/triplet/3d_triplet" +# solver mode: CPU or GPU +solver_mode: CPU diff --git a/examples/triplet/3d_triplet_train_test.prototxt b/examples/triplet/3d_triplet_train_test.prototxt new file mode 100644 index 00000000000..60637b1a66b --- /dev/null +++ b/examples/triplet/3d_triplet_train_test.prototxt @@ -0,0 +1,181 @@ +name: "3d_triplet_train_test" +layer { + name: "data" + type: "Data" + top: "data" + top: "sim" + include { + phase: TRAIN + } + transform_param { + scale: 0.00390625 + } + data_param { + source: "examples/triplet/3d_triplet_train_leveldb" + batch_size: 250 + } +} +layer { + name: "data" + type: "Data" + top: "data" + top: "sim" + include { + phase: TEST + } + transform_param { + scale: 0.00390625 + } + data_param { + source: "examples/triplet/3d_triplet_test_leveldb" + batch_size: 250 + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + name: "conv1_w" + lr_mult: 1 + } + param { + name: "conv1_b" + lr_mult: 2 + } + convolution_param { + num_output: 16 + kernel_size: 8 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "pool1" + top: "pool1" +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + name: "conv2_w" + lr_mult: 1 + } + param { + name: "conv2_b" + lr_mult: 2 + } + convolution_param { + num_output: 7 + kernel_size: 5 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: 
"pool2" + type: "Pooling" + bottom: "conv2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "pool2" + top: "pool2" +} +layer { + name: "ip1" + type: "InnerProduct" + bottom: "pool2" + top: "ip1" + param { + name: "ip1_w" + lr_mult: 1 + } + param { + name: "ip1_b" + lr_mult: 2 + } + inner_product_param { + num_output: 256 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "ip1" + top: "ip1" +} +layer { + name: "feat" + type: "InnerProduct" + bottom: "ip1" + top: "feat" + param { + name: "feat_w" + lr_mult: 1 + } + param { + name: "feat_b" + lr_mult: 2 + } + inner_product_param { + num_output: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "loss" + type: "TripletLoss" + bottom: "feat" + bottom: "sim" + top: "loss" + triplet_loss_param { + margin: 1 + losstype: 0 + num_triplets: 3 + } +} diff --git a/examples/triplet/convert_3d_triplet_data.cpp b/examples/triplet/convert_3d_triplet_data.cpp new file mode 100644 index 00000000000..943efd9965f --- /dev/null +++ b/examples/triplet/convert_3d_triplet_data.cpp @@ -0,0 +1,221 @@ +// Usage: +// convert_3d_data input_image_file input_label_file output_db_file +#include // NOLINT(readability/streams) +#include +#include "caffe/proto/caffe.pb.h" +#include "caffe/util/math_functions.hpp" +#include "glog/logging.h" +#include "google/protobuf/text_format.h" +#include "leveldb/db.h" +#include "math.h" +#include "stdint.h" + +uint32_t swap_endian(uint32_t val) { + val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF); + return (val << 16) | (val >> 16); +} + +void read_image(std::ifstream* image_file, std::ifstream* label_file, + uint32_t index, uint32_t rows, uint32_t cols, + char* pixels, char* label_temp, signed char* label, int rgb_use) { + if (rgb_use == 0) { + image_file->seekg(index * rows * cols + 
16); + image_file->read(pixels, rows * cols); + label_file->seekg(index * 4 + 8); + label_file->read(label_temp, 4); + for (int i = 0; i < 4; i++) + *(label+i) = (signed char)*(label_temp+i); + } else { + image_file->seekg(3 * index * rows * cols + 16); + image_file->read(pixels, 3 * rows * cols); + label_file->seekg(index * 4 + 8); + label_file->read(label_temp, 4); + for (int i = 0; i < 4; i++) + *(label+i) = (signed char)*(label_temp+i); + } +} + +void convert_dataset(const char* image_filename, const char* label_filename, + const char* db_filename, + const char* class_number, const char* rgb_use) { + int rgb_use1 = atoi(rgb_use); + int class_num = atoi(class_number); + // Open files + std::ifstream image_file(image_filename, std::ios::in | std::ios::binary); + std::ifstream label_file(label_filename, std::ios::in | std::ios::binary); + CHECK(image_file) << "Unable to open file " << image_filename; + CHECK(label_file) << "Unable to open file " << label_filename; + // Read the magic and the meta data + uint32_t magic; + uint32_t num_items; + uint32_t num_labels; + uint32_t rows; + uint32_t cols; + + image_file.read(reinterpret_cast(&magic), 4); + magic = swap_endian(magic); + CHECK_EQ(magic, 2051) << "Incorrect image file magic."; + label_file.read(reinterpret_cast(&magic), 4); + magic = swap_endian(magic); + CHECK_EQ(magic, 2050) << "Incorrect label file magic."; + image_file.read(reinterpret_cast(&num_items), 4); + num_items = swap_endian(num_items); + label_file.read(reinterpret_cast(&num_labels), 4); + num_labels = swap_endian(num_labels); + CHECK_EQ(num_items, num_labels); + image_file.read(reinterpret_cast(&rows), 4); + rows = swap_endian(rows); + image_file.read(reinterpret_cast(&cols), 4); + cols = swap_endian(cols); + + // Open leveldb + leveldb::DB* db; + leveldb::Options options; + options.create_if_missing = true; + options.error_if_exists = true; + leveldb::Status status = leveldb::DB::Open( + options, db_filename, &db); + CHECK(status.ok()) << 
"Failed to open leveldb " << db_filename + << ". Is it already existing?"; + + char* label_temp = new char[4]; // label for unsigned char* + signed char* label_i = new signed char[4]; // label for triplet + signed char* label_j = new signed char[4]; + signed char* label_k = new signed char[4]; + signed char* label_l = new signed char[4]; // label for pair wise + signed char* label_m = new signed char[4]; + int db_size; + if (rgb_use1 == 0) + db_size = rows * cols; + else + db_size = 3 * rows * cols; + char* pixels1 = new char[db_size]; + char* pixels2 = new char[db_size]; + char* pixels3 = new char[db_size]; + char* pixels4 = new char[db_size]; + char* pixels5 = new char[db_size]; + const int kMaxKeyLength = 10; + char key[kMaxKeyLength]; + std::string value; + caffe::Datum datum; + if (rgb_use1 == 0) + datum.set_channels(1); + else + datum.set_channels(3); + datum.set_height(rows); + datum.set_width(cols); + LOG(INFO) << "A total of " << num_items << " items."; + LOG(INFO) << "Rows: " << rows << " Cols: " << cols; + int counter = 0; + for (unsigned int times = 0; times < 10; ++times) { + // iteration in the samples of all class + for (unsigned int itemid = 0; itemid < num_items/class_num; ++itemid) { + // iteration in the samples in one class + for (unsigned int class_ind = 0; class_ind < class_num; ++class_ind) { + // use reference sample one by one at each iteration + int i = itemid % num_items + class_ind*num_items/class_num; + int j = caffe::caffe_rng_rand() % num_items; // pick triplet groups + int k = caffe::caffe_rng_rand() % num_items; + int l = caffe::caffe_rng_rand() % num_items; // pick pair wise groups + int m = caffe::caffe_rng_rand() % num_items; + read_image(&image_file, &label_file, i, rows, cols, // read triplet + pixels1, label_temp, label_i, rgb_use1); + read_image(&image_file, &label_file, j, rows, cols, + pixels2, label_temp, label_j, rgb_use1); + read_image(&image_file, &label_file, k, rows, cols, + pixels3, label_temp, label_k, rgb_use1); + 
read_image(&image_file, &label_file, l, rows, cols, // read pair wise + pixels4, label_temp, label_l, rgb_use1); + read_image(&image_file, &label_file, m, rows, cols, + pixels5, label_temp, label_m, rgb_use1); + + bool pair_pass = false; + bool triplet1_pass = false; + bool triplet2_pass = false; + bool triplet3_class_same = false; + bool triplet3_pass = false; + + int ij_diff_x = static_cast(*(label_i+1)-*(label_j+1)); + int ij_diff_y = static_cast(*(label_i+2)-*(label_j+2)); + int ij_diff_z = static_cast(*(label_i+3)-*(label_j+3)); + int im_diff_x = static_cast(*(label_i+1)-*(label_m+1)); + int im_diff_y = static_cast(*(label_i+2)-*(label_m+2)); + int im_diff_z = static_cast(*(label_i+3)-*(label_m+3)); + + int ij_x = ij_diff_x*ij_diff_x; + int ij_y = ij_diff_y*ij_diff_y; + int ij_z = ij_diff_z*ij_diff_z; + int im_x = im_diff_x*im_diff_x; + int im_y = im_diff_y*im_diff_y; + int im_z = im_diff_z*im_diff_z; + + float dist_ij = std::sqrt(ij_x + ij_y + ij_z); + float dist_im = std::sqrt(im_x + im_y + im_z); + if (*label_i == *label_j && dist_ij < 100/3 && dist_ij != 0) + pair_pass = true; + if (pair_pass && (*label_i != *label_k)) + triplet1_pass = true; + if (pair_pass && (*label_i != *label_l)) + triplet2_pass = true; + if (pair_pass && (*label_i == *label_m)) + triplet3_class_same = true; + if (triplet3_class_same && dist_im > 100/3) + triplet3_pass = true; + if (pair_pass && triplet1_pass && triplet2_pass && triplet3_pass) { + datum.set_data(pixels1, db_size); // set data + datum.set_label(static_cast(*label_i)); + datum.SerializeToString(&value); + snprintf(key, kMaxKeyLength, "%08d", counter); + db->Put(leveldb::WriteOptions(), std::string(key), value); + counter++; + datum.set_data(pixels2, db_size); // set data + datum.set_label(static_cast(*label_j)); + datum.SerializeToString(&value); + snprintf(key, kMaxKeyLength, "%08d", counter); + db->Put(leveldb::WriteOptions(), std::string(key), value); + counter++; + datum.set_data(pixels3, db_size); // set data + 
datum.set_label(static_cast(*label_k)); + datum.SerializeToString(&value); + snprintf(key, kMaxKeyLength, "%08d", counter); + db->Put(leveldb::WriteOptions(), std::string(key), value); + counter++; + datum.set_data(pixels4, db_size); // set data + datum.set_label(static_cast(*label_l)); + datum.SerializeToString(&value); + snprintf(key, kMaxKeyLength, "%08d", counter); + db->Put(leveldb::WriteOptions(), std::string(key), value); + counter++; + datum.set_data(pixels5, db_size); // set data + datum.set_label(static_cast(*label_m)); + datum.SerializeToString(&value); + snprintf(key, kMaxKeyLength, "%08d", counter); + db->Put(leveldb::WriteOptions(), std::string(key), value); + counter++; + } else { + class_ind--; + } + } // iteration in the samples of all class + } // iteration in the samples in one class + } // iteration in times + delete db; + delete pixels1; + delete pixels2; + delete pixels3; + delete pixels4; + delete pixels5; +} + +int main(int argc, char** argv) { + if (argc != 6) { + printf("This script converts the dataset to the leveldb format used\n" + "by caffe to train a triplet network.\n" + "Usage:\n" + " convert_3d_data input_image_file input_label_file " + "output_db_file class_number rgb_use \n"); + } else { + google::InitGoogleLogging(argv[0]); + convert_dataset(argv[1], argv[2], argv[3], argv[4], argv[5]); + } + return 0; +} diff --git a/examples/triplet/create_3d_triplet.sh b/examples/triplet/create_3d_triplet.sh new file mode 100755 index 00000000000..0fadd9b7e09 --- /dev/null +++ b/examples/triplet/create_3d_triplet.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env sh +# This script converts the mnist data into leveldb format. + +EXAMPLES=./build/examples/triplet +DATA=./data/linemod + +echo "Creating leveldb..." 
+ +rm -rf ./examples/triplet/3d_triplet_train_leveldb +rm -rf ./examples/triplet/3d_triplet_test_leveldb + +$EXAMPLES/convert_3d_triplet_data.bin \ + $DATA/binary_image_train \ + $DATA/binary_label_train \ + ./examples/triplet/3d_triplet_train_leveldb \ + 6 \ + 0 +$EXAMPLES/convert_3d_triplet_data.bin \ + $DATA/binary_image_test \ + $DATA/binary_label_test \ + ./examples/triplet/3d_triplet_test_leveldb \ + 6 \ + 0 +echo "Done." diff --git a/examples/triplet/readme.md b/examples/triplet/readme.md new file mode 100644 index 00000000000..6636808691a --- /dev/null +++ b/examples/triplet/readme.md @@ -0,0 +1,186 @@ +--- +title: Triplet Network Tutorial +description: Train and test a triplet network on data generated by 3D model. +category: example +include_in_docs: true +layout: default +priority: 100 +--- + +# Triplet Network Training with Caffe +This example shows how you can use weight sharing and a contrastive loss +function to learn a model using a triplet network in Caffe. + +We will assume that you have caffe successfully compiled. If not, please refer +to the [Installation page](../../installation.html). This example builds on the +[MNIST tutorial](mnist.html) so it would be a good idea to read that before +continuing. + +*The guide specifies all paths and assumes all commands are executed from the +root caffe directory* + +## Prepare Datasets + +You will first need to convert the data from some .ply models using the +opencv_contrib cnn_3donj module. After constructing the binary files including images and labels and putting them in the ./data/linemod folder, just run: + + ./examples/triplet/create_3d_triplet.sh + +After running the script there should be two datasets, +`./examples/triplet/3d_triplet_train_leveldb`, and +`./examples/triplet/3d_triplet_test_leveldb`. + +## The Model +First, we will define the model that we want to train using the triplet network. +We will use the convolutional net defined in +`./examples/triplet/3d_triplet.prototxt`. 
+ +layer { + name: "feat" + type: "InnerProduct" + bottom: "ip1" + top: "feat" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + inner_product_param { + num_output: 4 + } +} + +## Define the triplet Network + +In this section we will define the triplet network used for training. The +resulting network is defined in +`./examples/triplet/3d_triplet_train_test.prototxt`. + +### Reading in the Triplet Data + +We start with a data layer that reads from the LevelDB database we created +earlier. Each entry in this database contains the image data for a triplet of +images (`triplet_data`) and the label (`sim`) is not necessary in our method. + + layers { + name: "triplet_data" + type: DATA + top: "triplet_data" + top: "sim" + data_param { + source: "examples/triplet/3d_triplet_train_leveldb" + scale: 0.00390625 + batch_size: 250 + } + include: { phase: TRAIN } + } + +In order to pack a triplet of images into the same blob in the database we pack one +image per channel. We want to be able to work with these three images separately, +so we add a slice layer after the data layer. This takes the `triplet_data` and +slices it along the channel dimension so that we have a single image in `data`, +its positive image in `data_pos`, and its negative image in `data_neg`; as described in the paper on 3D object classification and pose estimation, a pairwise term is also needed along with the triplet part. + +layer { + name: "slice_triplet" + type: "Slice" + bottom: "triplet_data" + top: "data" + top: "data_true" + top: "data_false" + top: "data_p1" + top: "data_p2" + slice_param { + slice_dim: 1 + slice_point: 1 + slice_point: 2 + slice_point: 3 + slice_point: 4 + } +} + +### Building the First part of the triplet Net + +Now we can specify the first side of the triplet net. This side operates on +`data` and produces `feat`. Starting from the net in +`./examples/triplet/3d_triplet.prototxt` we add default weight fillers.
Then +we name the parameters of the convolutional and inner product layers. Naming the +parameters allows Caffe to share the parameters between layers on three channels of +the triplet net. In the definition this looks like: + + ... + param: "conv1_w" + param: "conv1_b" + ... + param: "conv2_w" + param: "conv2_b" + ... + param: "ip1_w" + param: "ip1_b" + ... + param: "ip2_w" + param: "ip2_b" + ... + +### Building the Second Side of the triplet Net + +Now we need to create the second path that operates on `data_pos` and produces +`feat_pos`. This path is exactly the same as the first. So we can just copy and +paste it. Then we change the name of each layer, input, and output by appending +`_pos` to differentiate the "paired" layers from the originals. + +### Building the Third Side of the triplet Net + +Now we need to create the third path that operates on `data_neg` and produces +`feat_neg`. This path is exactly the same as the first. So we can just copy and +paste it. Then we change the name of each layer, input, and output by appending +`_neg` to differentiate the "paired" layers from the originals. + +### Adding the Triplet Loss Function + +To train the network we will optimize a triplet loss function proposed in: +This cost function is implemented with the `TRIPLET_LOSS` layer: + +layer { + name: "loss" + type: "TripletLoss" + bottom: "feat" + bottom: "feat_true" + bottom: "feat_false" + bottom: "feat_p1" + bottom: "feat_p2" + bottom: "sim" + top: "loss" + triplet_loss_param { + margin: 1 + losstype: 1 + } +} + +## Define the Solver + +Nothing special needs to be done to the solver besides pointing it at the +correct model file. The solver is defined in +`./examples/triplet/3d_triplet_solver.prototxt`. + +## Training and Testing the Model + +Training the model is simple after you have written the network definition +protobuf and solver protobuf files.
Simply run +`./examples/triplet/train_3d_triplet.sh`: + + ./examples/triplet/train_3d_triplet.sh + +# Plotting the results + +First, we can draw the model and triplet networks by running the following +commands that draw the DAGs defined in the .prototxt files: + + ./python/draw_net.py \ + ./examples/triplet/3d_triplet.prototxt \ + ./examples/triplet/3d_triplet.png + + ./python/draw_net.py \ + ./examples/triplet/3d_triplet_train_test.prototxt \ + ./examples/triplet/3d_triplet_train_test.png \ No newline at end of file diff --git a/examples/triplet/train_3d_triplet.sh b/examples/triplet/train_3d_triplet.sh new file mode 100755 index 00000000000..e421af54493 --- /dev/null +++ b/examples/triplet/train_3d_triplet.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env sh + +TOOLS=./build/tools + +$TOOLS/caffe train --solver=examples/triplet/3d_triplet_solver.prototxt diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index fea5117ef10..dda7b1f8372 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -219,7 +219,6 @@ class Blob { const Dtype* cpu_data() const; void set_cpu_data(Dtype* data); - const int* gpu_shape() const; const Dtype* gpu_data() const; const Dtype* cpu_diff() const; const Dtype* gpu_diff() const; @@ -269,7 +268,6 @@ class Blob { protected: shared_ptr data_; shared_ptr diff_; - shared_ptr shape_data_; vector shape_; int count_; int capacity_; diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index 89bab8d6f3a..8e64b3e5dc5 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -85,7 +85,7 @@ class ConcatLayer : public Layer { const vector*>& top); virtual inline const char* type() const { return "Concat"; } - virtual inline int MinBottomBlobs() const { return 1; } + virtual inline int MinBottomBlobs() const { return 2; } virtual inline int ExactNumTopBlobs() const { return 1; } protected: @@ -625,7 +625,7 @@ class SliceLayer : public Layer { virtual inline const char* type() const {
return "Slice"; } virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } + virtual inline int MinTopBlobs() const { return 2; } protected: virtual void Forward_cpu(const vector*>& bottom, diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 90fd0d19917..552d814131e 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -4,6 +4,7 @@ #include #include #include + #include "hdf5.h" #include "caffe/blob.hpp" @@ -274,10 +275,8 @@ class MemoryDataLayer : public BaseDataLayer { virtual inline int ExactNumTopBlobs() const { return 2; } virtual void AddDatumVector(const vector& datum_vector); -#ifdef USE_OPENCV virtual void AddMatVector(const vector& mat_vector, const vector& labels); -#endif // USE_OPENCV // Reset should accept const pointers, but can't, because the memory // will be given to Blob, which is mutable diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp index 97b4ee6a8c4..0ad68c80216 100644 --- a/include/caffe/data_transformer.hpp +++ b/include/caffe/data_transformer.hpp @@ -50,7 +50,6 @@ class DataTransformer { void Transform(const vector & datum_vector, Blob* transformed_blob); -#ifdef USE_OPENCV /** * @brief Applies the transformation defined in the data layer's * transform_param block to a vector of Mat. @@ -75,7 +74,6 @@ class DataTransformer { * set_cpu_data() is used. See image_data_layer.cpp for an example. */ void Transform(const cv::Mat& cv_img, Blob* transformed_blob); -#endif // USE_OPENCV /** * @brief Applies the same transformation defined in the data layer's @@ -115,7 +113,6 @@ class DataTransformer { * @param mat_vector * A vector of Mat containing the data to be transformed. 
*/ -#ifdef USE_OPENCV vector InferBlobShape(const vector & mat_vector); /** * @brief Infers the shape of transformed_blob will have when @@ -125,7 +122,6 @@ class DataTransformer { * cv::Mat containing the data to be transformed. */ vector InferBlobShape(const cv::Mat& cv_img); -#endif // USE_OPENCV protected: /** @@ -152,3 +148,4 @@ class DataTransformer { } // namespace caffe #endif // CAFFE_DATA_TRANSFORMER_HPP_ + diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 8d41af34e88..0a513ae12a4 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -216,6 +216,73 @@ class ContrastiveLossLayer : public LossLayer { Blob summer_vec_; // tmp storage for gpu forward pass }; +template +class TripletLossLayer : public LossLayer { + public: + explicit TripletLossLayer(const LayerParameter& param) + : LossLayer(param), diff_() {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline int ExactNumBottomBlobs() const { return 2; } + virtual inline const char* type() const { return "TripletLoss"; } + /** + * Unlike most loss layers, in the TripletLossLayer we can backpropagate + * to the first three inputs. + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return bottom_index != 1; + } + + protected: + /// @copydoc TripletLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the Triplet error gradient w.r.t. the inputs. + * + * Computes the gradients with respect to the two input vectors (bottom[0] and + * bottom[1]), but not the similarity label (bottom[2]). 
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times 1 \times 1) @f$ + * the features @f$a@f$; Backward fills their diff with + * gradients if propagate_down[0] + * -# @f$ (N \times C \times 1 \times 1) @f$ + * the features @f$b@f$; Backward fills their diff with gradients if + * propagate_down[1] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob diff_; // cached for backward pass + Blob diff_pos; + Blob diff_neg; + Blob dist_sq_; // cached for backward pass + Blob dist_sq_pos; + Blob dist_sq_neg; + Blob diff_sq_; // tmp storage for gpu forward pass + Blob diff_sq_pos; + Blob diff_sq_neg; + Blob summer_vec_; // tmp storage for gpu forward pass +}; + /** * @brief Computes the Euclidean (L2) loss @f$ * E = \frac{1}{2N} \sum\limits_{n=1}^N \left| \left| \hat{y}_n - y_n diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index 2ecf539baef..aba3e036004 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -82,8 +82,6 @@ class Solver { callbacks_.push_back(value); } - void CheckSnapshotWritePermissions(); - protected: // Make and apply the update value for the current iteration. 
virtual void ApplyUpdate() = 0; @@ -283,19 +281,19 @@ Solver* GetSolver(const SolverParameter& param) { switch (type) { case SolverParameter_SolverType_SGD: - return new SGDSolver(param); + return new SGDSolver(param); case SolverParameter_SolverType_NESTEROV: - return new NesterovSolver(param); + return new NesterovSolver(param); case SolverParameter_SolverType_ADAGRAD: - return new AdaGradSolver(param); + return new AdaGradSolver(param); case SolverParameter_SolverType_RMSPROP: - return new RMSPropSolver(param); + return new RMSPropSolver(param); case SolverParameter_SolverType_ADADELTA: - return new AdaDeltaSolver(param); + return new AdaDeltaSolver(param); case SolverParameter_SolverType_ADAM: - return new AdamSolver(param); + return new AdamSolver(param); default: - LOG(FATAL) << "Unknown SolverType: " << type; + LOG(FATAL) << "Unknown SolverType: " << type; } return (Solver*) NULL; } diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp index e9fa0d32b66..10623554b67 100644 --- a/include/caffe/util/db_leveldb.hpp +++ b/include/caffe/util/db_leveldb.hpp @@ -1,4 +1,3 @@ -#ifdef USE_LEVELDB #ifndef CAFFE_UTIL_DB_LEVELDB_HPP #define CAFFE_UTIL_DB_LEVELDB_HPP @@ -72,4 +71,3 @@ class LevelDB : public DB { } // namespace caffe #endif // CAFFE_UTIL_DB_LEVELDB_HPP -#endif // USE_LEVELDB diff --git a/include/caffe/util/db_lmdb.hpp b/include/caffe/util/db_lmdb.hpp index 4e1568ace50..cc7c90afc4c 100644 --- a/include/caffe/util/db_lmdb.hpp +++ b/include/caffe/util/db_lmdb.hpp @@ -1,4 +1,3 @@ -#ifdef USE_LMDB #ifndef CAFFE_UTIL_DB_LMDB_HPP #define CAFFE_UTIL_DB_LMDB_HPP @@ -90,4 +89,3 @@ class LMDB : public DB { } // namespace caffe #endif // CAFFE_UTIL_DB_LMDB_HPP -#endif // USE_LMDB diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index 531fd29c57a..0051e2fa067 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -3,48 +3,24 @@ namespace caffe { -template -void im2col_nd_cpu(const Dtype* 
data_im, const int num_spatial_axes, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - Dtype* data_col); - template void im2col_cpu(const Dtype* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col); -template -void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - Dtype* data_im); - template void col2im_cpu(const Dtype* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_im); -template -void im2col_nd_gpu(const Dtype* data_im, const int num_spatial_axes, - const int col_size, const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - Dtype* data_col); - template void im2col_gpu(const Dtype* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col); -template -void col2im_nd_gpu(const Dtype* data_col, const int num_spatial_axes, - const int im_size, const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - Dtype* data_im); - template void col2im_gpu(const Dtype* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index 6070b4c7f3a..c0938ad0625 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -120,7 +120,6 @@ inline bool ReadImageToDatum(const string& filename, const int label, bool DecodeDatumNative(Datum* datum); bool DecodeDatum(Datum* 
datum, bool is_color); -#ifdef USE_OPENCV cv::Mat ReadImageToCVMat(const string& filename, const int height, const int width, const bool is_color); @@ -136,7 +135,6 @@ cv::Mat DecodeDatumToCVMatNative(const Datum& datum); cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color); void CVMatToDatum(const cv::Mat& cv_img, Datum* datum); -#endif // USE_OPENCV } // namespace caffe diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 06bc0457e2d..211e3d9042d 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -58,110 +58,52 @@ class BaseConvolutionLayer : public Layer { void backward_gpu_bias(Dtype* bias, const Dtype* input); #endif - /// @brief The spatial dimensions of the input. - inline int input_shape(int i) { - return (*bottom_shape_)[channel_axis_ + i]; - } // reverse_dimensions should return true iff we are implementing deconv, so // that conv helpers know which dimensions are which. virtual bool reverse_dimensions() = 0; // Compute height_out_ and width_out_ from other parameters. virtual void compute_output_shape() = 0; - /// @brief The spatial dimensions of a filter kernel. - Blob kernel_shape_; - /// @brief The spatial dimensions of the stride. - Blob stride_; - /// @brief The spatial dimensions of the padding. - Blob pad_; - /// @brief The spatial dimensions of the convolution input. - Blob conv_input_shape_; - /// @brief The spatial dimensions of the col_buffer. - vector col_buffer_shape_; - /// @brief The spatial dimensions of the output. 
- vector output_shape_; - const vector* bottom_shape_; - - int num_spatial_axes_; - int bottom_dim_; - int top_dim_; - - int channel_axis_; + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; int num_; int channels_; + int pad_h_, pad_w_; + int height_, width_; int group_; - int out_spatial_dim_; - int weight_offset_; int num_output_; + int height_out_, width_out_; bool bias_term_; bool is_1x1_; - bool force_nd_im2col_; private: // wrap im2col/col2im so we don't have to remember the (long) argument lists inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - im2col_cpu(data, conv_in_channels_, - conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], - kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], col_buff); - } else { - im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(), - col_buffer_shape_.data(), kernel_shape_.cpu_data(), - pad_.cpu_data(), stride_.cpu_data(), col_buff); - } + im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); } inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - col2im_cpu(col_buff, conv_in_channels_, - conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], - kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], data); - } else { - col2im_nd_cpu(col_buff, num_spatial_axes_, conv_input_shape_.cpu_data(), - col_buffer_shape_.data(), kernel_shape_.cpu_data(), - pad_.cpu_data(), stride_.cpu_data(), data); - } + col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); } #ifndef CPU_ONLY inline void 
conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - im2col_gpu(data, conv_in_channels_, - conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], - kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], col_buff); - } else { - im2col_nd_gpu(data, num_spatial_axes_, num_kernels_im2col_, - conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(), - kernel_shape_.gpu_data(), pad_.gpu_data(), - stride_.gpu_data(), col_buff); - } + im2col_gpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); } inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - col2im_gpu(col_buff, conv_in_channels_, - conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], - kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], data); - } else { - col2im_nd_gpu(col_buff, num_spatial_axes_, num_kernels_col2im_, - conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(), - kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(), - data); - } + col2im_gpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); } #endif - int num_kernels_im2col_; - int num_kernels_col2im_; int conv_out_channels_; int conv_in_channels_; int conv_out_spatial_dim_; + int conv_in_height_; + int conv_in_width_; int kernel_dim_; + int weight_offset_; int col_offset_; int output_offset_; @@ -308,7 +250,7 @@ class CuDNNConvolutionLayer : public ConvolutionLayer { cudnnTensorDescriptor_t bias_desc_; cudnnFilterDescriptor_t filter_desc_; vector conv_descs_; - int bottom_offset_, top_offset_, bias_offset_; + int bottom_offset_, top_offset_, 
weight_offset_, bias_offset_; size_t workspaceSizeInBytes; void *workspace; }; @@ -345,22 +287,11 @@ class Im2colLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - /// @brief The spatial dimensions of a filter kernel. - Blob kernel_shape_; - /// @brief The spatial dimensions of the stride. - Blob stride_; - /// @brief The spatial dimensions of the padding. - Blob pad_; - - int num_spatial_axes_; - int bottom_dim_; - int top_dim_; - - int channel_axis_; - int num_; + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; int channels_; - - bool force_nd_im2col_; + int height_, width_; + int pad_h_, pad_w_; }; // Forward declare PoolingLayer and SplitLayer for use in LRNLayer. diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py index ccda1bcae4f..6cc44e729f4 100644 --- a/python/caffe/__init__.py +++ b/python/caffe/__init__.py @@ -1,4 +1,4 @@ -from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver +from .pycaffe import Net, SGDSolver from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list from .proto.caffe_pb2 import TRAIN, TEST from .classifier import Classifier diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index ccd5776ac40..cc49f60ab13 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -297,15 +297,6 @@ BOOST_PYTHON_MODULE(_caffe) { bp::class_, bp::bases >, shared_ptr >, boost::noncopyable>( "AdaGradSolver", bp::init()); - bp::class_, bp::bases >, - shared_ptr >, boost::noncopyable>( - "RMSPropSolver", bp::init()); - bp::class_, bp::bases >, - shared_ptr >, boost::noncopyable>( - "AdaDeltaSolver", bp::init()); - bp::class_, bp::bases >, - shared_ptr >, boost::noncopyable>( - "AdamSolver", bp::init()); bp::def("get_solver", &GetSolverFromFile, bp::return_value_policy()); diff --git a/python/caffe/io.py b/python/caffe/io.py index 
0cad7211291..fc96266085f 100644 --- a/python/caffe/io.py +++ b/python/caffe/io.py @@ -329,7 +329,7 @@ def resize_image(im, new_dims, interp_order=1): return ret else: # ndimage interpolates anything but more slowly. - scale = tuple(np.array(new_dims, dtype=float) / np.array(im.shape[:2])) + scale = tuple(np.array(new_dims) / np.array(im.shape[:2])) resized_im = zoom(im, scale + (1,), order=interp_order) return resized_im.astype(np.float32) diff --git a/python/caffe/net_spec.py b/python/caffe/net_spec.py index 93fc01927db..77a0e0070ae 100644 --- a/python/caffe/net_spec.py +++ b/python/caffe/net_spec.py @@ -56,14 +56,8 @@ def to_proto(*tops): def assign_proto(proto, name, val): """Assign a Python object to a protobuf message, based on the Python type (in recursive fashion). Lists become repeated fields/messages, dicts - become messages, and other types are assigned directly. For convenience, - repeated fields whose values are not lists are converted to single-element - lists; e.g., `my_repeated_int_field=3` is converted to - `my_repeated_int_field=[3]`.""" - - is_repeated_field = hasattr(getattr(proto, name), 'extend') - if is_repeated_field and not isinstance(val, list): - val = [val] + become messages, and other types are assigned directly.""" + if isinstance(val, list): if isinstance(val[0], dict): for item in val: diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py index 8ea24da4fdd..4f980a92c38 100644 --- a/python/caffe/pycaffe.py +++ b/python/caffe/pycaffe.py @@ -10,8 +10,7 @@ from itertools import zip_longest as izip_longest import numpy as np -from ._caffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, \ - RMSPropSolver, AdaDeltaSolver, AdamSolver +from ._caffe import Net, SGDSolver import caffe.io # We directly update methods from Net here (rather than using composition or diff --git a/python/caffe/test/test_layer_type_list.py b/python/caffe/test/test_layer_type_list.py index 47f4cf6d008..7edc80df069 100644 --- 
a/python/caffe/test/test_layer_type_list.py +++ b/python/caffe/test/test_layer_type_list.py @@ -5,7 +5,6 @@ class TestLayerTypeList(unittest.TestCase): def test_standard_types(self): - #removing 'Data' from list for type_name in ['Data', 'Convolution', 'InnerProduct']: self.assertIn(type_name, caffe.layer_type_list(), '%s not in layer_type_list()' % type_name) diff --git a/python/caffe/test/test_net_spec.py b/python/caffe/test/test_net_spec.py index fee3c0aaebe..b4595e6531a 100644 --- a/python/caffe/test/test_net_spec.py +++ b/python/caffe/test/test_net_spec.py @@ -43,7 +43,8 @@ def anon_lenet(batch_size): def silent_net(): n = caffe.NetSpec() - n.data, n.data2 = L.DummyData(shape=dict(dim=3), ntop=2) + n.data, n.data2 = L.DummyData(shape=[dict(dim=[3]), dict(dim=[4, 2])], + ntop=2) n.silence_data = L.Silence(n.data, ntop=0) n.silence_data2 = L.Silence(n.data2, ntop=0) return n.to_proto() diff --git a/scripts/travis/travis_build_and_test.sh b/scripts/travis/travis_build_and_test.sh index 174f1ee5a0a..9ba737e28a9 100755 --- a/scripts/travis/travis_build_and_test.sh +++ b/scripts/travis/travis_build_and_test.sh @@ -1,6 +1,5 @@ #!/bin/bash -# Script called by Travis to build and test Caffe. -# Travis CI tests are CPU-only for lack of compatible hardware. +# Script called by Travis to do a CPU-only build of and test Caffe. set -e MAKE="make --jobs=$NUM_THREADS --keep-going" @@ -16,12 +15,7 @@ if $WITH_CMAKE; then if [ "$PYTHON_VERSION" = "3" ]; then PYTHON_ARGS="$PYTHON_ARGS -Dpython_version=3 -DBOOST_LIBRARYDIR=$CONDA_DIR/lib/" fi - if $WITH_IO; then - IO_ARGS="-DUSE_OPENCV=ON -DUSE_LMDB=ON -DUSE_LEVELDB=ON" - else - IO_ARGS="-DUSE_OPENCV=OFF -DUSE_LMDB=OFF -DUSE_LEVELDB=OFF" - fi - cmake -DBUILD_python=ON -DCMAKE_BUILD_TYPE=Release $CPU_ONLY $PYTHON_ARGS -DCMAKE_INCLUDE_PATH="$CONDA_DIR/include/" -DCMAKE_LIBRARY_PATH="$CONDA_DIR/lib/" $IO_ARGS .. 
+ cmake -DBUILD_python=ON -DCMAKE_BUILD_TYPE=Release $CPU_ONLY $PYTHON_ARGS -DCMAKE_INCLUDE_PATH="$CONDA_DIR/include/" -DCMAKE_LIBRARY_PATH="$CONDA_DIR/lib/" .. $MAKE $MAKE pytest if ! $WITH_CUDA; then @@ -34,11 +28,6 @@ else if ! $WITH_CUDA; then export CPU_ONLY=1 fi - if $WITH_IO; then - export USE_LMDB=1 - export USE_LEVELDB=1 - export USE_OPENCV=1 - fi $MAKE all test pycaffe warn lint || true if ! $WITH_CUDA; then $MAKE runtest diff --git a/scripts/travis/travis_setup_makefile_config.sh b/scripts/travis/travis_setup_makefile_config.sh index 83aacf11fb0..1440be2af8b 100755 --- a/scripts/travis/travis_setup_makefile_config.sh +++ b/scripts/travis/travis_setup_makefile_config.sh @@ -11,12 +11,6 @@ if $WITH_CUDA; then echo "CUDA_ARCH := $GENCODE" >> Makefile.config fi -# Remove IO library settings from Makefile.config -# to avoid conflicts with CI configuration -sed -i -e '/USE_LMDB/d' Makefile.config -sed -i -e '/USE_LEVELDB/d' Makefile.config -sed -i -e '/USE_OPENCV/d' Makefile.config - cat << 'EOF' >> Makefile.config # Travis' nvcc doesn't like newer boost versions NVCCFLAGS := -Xcudafe --diag_suppress=cc_clobber_ignored -Xcudafe --diag_suppress=useless_using_declaration -Xcudafe --diag_suppress=set_but_not_used diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index c86fd5d1d94..8450aa140be 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -24,16 +24,11 @@ void Blob::Reshape(const vector& shape) { CHECK_LE(shape.size(), kMaxBlobAxes); count_ = 1; shape_.resize(shape.size()); - if (!shape_data_ || shape_data_->size() < shape.size() * sizeof(int)) { - shape_data_.reset(new SyncedMemory(shape.size() * sizeof(int))); - } - int* shape_data = static_cast(shape_data_->mutable_cpu_data()); for (int i = 0; i < shape.size(); ++i) { CHECK_GE(shape[i], 0); CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX"; count_ *= shape[i]; shape_[i] = shape[i]; - shape_data[i] = shape[i]; } if (count_ > capacity_) { capacity_ = count_; @@ -72,12 +67,6 @@ 
Blob::Blob(const vector& shape) Reshape(shape); } -template -const int* Blob::gpu_shape() const { - CHECK(shape_data_); - return (const int*)shape_data_->gpu_data(); -} - template const Dtype* Blob::cpu_data() const { CHECK(data_); diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp index 7189d67e289..4666d9bd881 100644 --- a/src/caffe/data_transformer.cpp +++ b/src/caffe/data_transformer.cpp @@ -1,6 +1,4 @@ -#ifdef USE_OPENCV #include -#endif // USE_OPENCV #include #include @@ -126,13 +124,11 @@ void DataTransformer::Transform(const Datum& datum, } } - template void DataTransformer::Transform(const Datum& datum, Blob* transformed_blob) { // If datum is encoded, decoded and transform the cv::image. if (datum.encoded()) { -#ifdef USE_OPENCV CHECK(!(param_.force_color() && param_.force_gray())) << "cannot set both force_color and force_gray"; cv::Mat cv_img; @@ -144,9 +140,6 @@ void DataTransformer::Transform(const Datum& datum, } // Transform the cv::image into blob. 
return Transform(cv_img, transformed_blob); -#else - LOG(FATAL) << "Encoded datum requires OpenCV; compile with USE_OPENCV."; -#endif // USE_OPENCV } else { if (param_.force_color() || param_.force_gray()) { LOG(ERROR) << "force_color and force_gray only for encoded datum"; @@ -201,7 +194,6 @@ void DataTransformer::Transform(const vector & datum_vector, } } -#ifdef USE_OPENCV template void DataTransformer::Transform(const vector & mat_vector, Blob* transformed_blob) { @@ -323,7 +315,6 @@ void DataTransformer::Transform(const cv::Mat& cv_img, } } } -#endif // USE_OPENCV template void DataTransformer::Transform(Blob* input_blob, @@ -441,7 +432,6 @@ void DataTransformer::Transform(Blob* input_blob, template vector DataTransformer::InferBlobShape(const Datum& datum) { if (datum.encoded()) { -#ifdef USE_OPENCV CHECK(!(param_.force_color() && param_.force_gray())) << "cannot set both force_color and force_gray"; cv::Mat cv_img; @@ -453,10 +443,8 @@ vector DataTransformer::InferBlobShape(const Datum& datum) { } // InferBlobShape using the cv::image. 
return InferBlobShape(cv_img); -#else - LOG(FATAL) << "Encoded datum requires OpenCV; compile with USE_OPENCV."; -#endif // USE_OPENCV } + const int crop_size = param_.crop_size(); const int datum_channels = datum.channels(); const int datum_height = datum.height(); @@ -486,7 +474,6 @@ vector DataTransformer::InferBlobShape( return shape; } -#ifdef USE_OPENCV template vector DataTransformer::InferBlobShape(const cv::Mat& cv_img) { const int crop_size = param_.crop_size(); @@ -517,7 +504,6 @@ vector DataTransformer::InferBlobShape( shape[0] = num; return shape; } -#endif // USE_OPENCV template void DataTransformer::InitRand() { diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index c6b47550292..ccb3adc7e89 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -1,4 +1,3 @@ -#include #include #include "caffe/filler.hpp" @@ -12,97 +11,50 @@ namespace caffe { template void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; // Configure the kernel size, padding, stride, and inputs. ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - force_nd_im2col_ = conv_param.force_nd_im2col(); - channel_axis_ = bottom[0]->CanonicalAxisIndex(conv_param.axis()); - const int first_spatial_axis = channel_axis_ + 1; - const int num_axes = bottom[0]->num_axes(); - num_spatial_axes_ = num_axes - first_spatial_axis; - CHECK_GE(num_spatial_axes_, 0); - vector bottom_dim_blob_shape(1, num_spatial_axes_ + 1); - vector spatial_dim_blob_shape(1, std::max(num_spatial_axes_, 1)); - // Setup filter kernel dimensions (kernel_shape_). 
- kernel_shape_.Reshape(spatial_dim_blob_shape); - int* kernel_shape_data = kernel_shape_.mutable_cpu_data(); - if (conv_param.has_kernel_h() || conv_param.has_kernel_w()) { - CHECK_EQ(num_spatial_axes_, 2) - << "kernel_h & kernel_w can only be used for 2D convolution."; - CHECK_EQ(0, conv_param.kernel_size_size()) - << "Either kernel_size or kernel_h/w should be specified; not both."; - kernel_shape_data[0] = conv_param.kernel_h(); - kernel_shape_data[1] = conv_param.kernel_w(); + CHECK(!conv_param.has_kernel_size() != + !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; + CHECK(conv_param.has_kernel_size() || + (conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "For non-square filters both kernel_h and kernel_w are required."; + CHECK((!conv_param.has_pad() && conv_param.has_pad_h() + && conv_param.has_pad_w()) + || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) + << "pad is pad OR pad_h and pad_w are required."; + CHECK((!conv_param.has_stride() && conv_param.has_stride_h() + && conv_param.has_stride_w()) + || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) + << "Stride is stride OR stride_h and stride_w are required."; + if (conv_param.has_kernel_size()) { + kernel_h_ = kernel_w_ = conv_param.kernel_size(); } else { - const int num_kernel_dims = conv_param.kernel_size_size(); - CHECK(num_kernel_dims == 1 || num_kernel_dims == num_spatial_axes_) - << "kernel_size must be specified once, or once per spatial dimension " - << "(kernel_size specified " << num_kernel_dims << " times; " - << num_spatial_axes_ << " spatial dims);"; - for (int i = 0; i < num_spatial_axes_; ++i) { - kernel_shape_data[i] = - conv_param.kernel_size((num_kernel_dims == 1) ? 
0 : i); - } + kernel_h_ = conv_param.kernel_h(); + kernel_w_ = conv_param.kernel_w(); } - for (int i = 0; i < num_spatial_axes_; ++i) { - CHECK_GT(kernel_shape_data[i], 0) << "Filter dimensions must be nonzero."; - } - // Setup stride dimensions (stride_). - stride_.Reshape(spatial_dim_blob_shape); - int* stride_data = stride_.mutable_cpu_data(); - if (conv_param.has_stride_h() || conv_param.has_stride_w()) { - CHECK_EQ(num_spatial_axes_, 2) - << "stride_h & stride_w can only be used for 2D convolution."; - CHECK_EQ(0, conv_param.stride_size()) - << "Either stride or stride_h/w should be specified; not both."; - stride_data[0] = conv_param.stride_h(); - stride_data[1] = conv_param.stride_w(); + CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; + CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; + if (!conv_param.has_pad_h()) { + pad_h_ = pad_w_ = conv_param.pad(); } else { - const int num_stride_dims = conv_param.stride_size(); - CHECK(num_stride_dims == 0 || num_stride_dims == 1 || - num_stride_dims == num_spatial_axes_) - << "stride must be specified once, or once per spatial dimension " - << "(stride specified " << num_stride_dims << " times; " - << num_spatial_axes_ << " spatial dims);"; - const int kDefaultStride = 1; - for (int i = 0; i < num_spatial_axes_; ++i) { - stride_data[i] = (num_stride_dims == 0) ? kDefaultStride : - conv_param.stride((num_stride_dims == 1) ? 0 : i); - CHECK_GT(stride_data[i], 0) << "Stride dimensions must be nonzero."; - } + pad_h_ = conv_param.pad_h(); + pad_w_ = conv_param.pad_w(); } - // Setup pad dimensions (pad_). 
- pad_.Reshape(spatial_dim_blob_shape); - int* pad_data = pad_.mutable_cpu_data(); - if (conv_param.has_pad_h() || conv_param.has_pad_w()) { - CHECK_EQ(num_spatial_axes_, 2) - << "pad_h & pad_w can only be used for 2D convolution."; - CHECK_EQ(0, conv_param.pad_size()) - << "Either pad or pad_h/w should be specified; not both."; - pad_data[0] = conv_param.pad_h(); - pad_data[1] = conv_param.pad_w(); + if (!conv_param.has_stride_h()) { + stride_h_ = stride_w_ = conv_param.stride(); } else { - const int num_pad_dims = conv_param.pad_size(); - CHECK(num_pad_dims == 0 || num_pad_dims == 1 || - num_pad_dims == num_spatial_axes_) - << "pad must be specified once, or once per spatial dimension " - << "(pad specified " << num_pad_dims << " times; " - << num_spatial_axes_ << " spatial dims);"; - const int kDefaultPad = 0; - for (int i = 0; i < num_spatial_axes_; ++i) { - pad_data[i] = (num_pad_dims == 0) ? kDefaultPad : - conv_param.pad((num_pad_dims == 1) ? 0 : i); - } + stride_h_ = conv_param.stride_h(); + stride_w_ = conv_param.stride_w(); } // Special case: im2col is the identity for 1x1 convolution with stride 1 // and no padding, so flag for skipping the buffer and transformation. - is_1x1_ = true; - for (int i = 0; i < num_spatial_axes_; ++i) { - is_1x1_ &= - kernel_shape_data[i] == 1 && stride_data[i] == 1 && pad_data[i] == 0; - if (!is_1x1_) { break; } - } + is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1 + && stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0; // Configure output channels and groups. - channels_ = bottom[0]->shape(channel_axis_); + channels_ = bottom[0]->channels(); num_output_ = this->layer_param_.convolution_param().num_output(); CHECK_GT(num_output_, 0); group_ = this->layer_param_.convolution_param().group(); @@ -119,29 +71,8 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, // Handle the parameters: weights and biases. 
// - blobs_[0] holds the filter weights // - blobs_[1] holds the biases (optional) - vector weight_shape(2); - weight_shape[0] = conv_out_channels_; - weight_shape[1] = conv_in_channels_ / group_; - for (int i = 0; i < num_spatial_axes_; ++i) { - weight_shape.push_back(kernel_shape_data[i]); - } bias_term_ = this->layer_param_.convolution_param().bias_term(); - vector bias_shape(bias_term_, num_output_); if (this->blobs_.size() > 0) { - CHECK_EQ(1 + bias_term_, this->blobs_.size()) - << "Incorrect number of weight blobs."; - if (weight_shape != this->blobs_[0]->shape()) { - Blob weight_shaped_blob(weight_shape); - LOG(FATAL) << "Incorrect weight shape: expected shape " - << weight_shaped_blob.shape_string() << "; instead, shape was " - << this->blobs_[0]->shape_string(); - } - if (bias_term_ && bias_shape != this->blobs_[1]->shape()) { - Blob bias_shaped_blob(bias_shape); - LOG(FATAL) << "Incorrect bias shape: expected shape " - << bias_shaped_blob.shape_string() << "; instead, shape was " - << this->blobs_[1]->shape_string(); - } LOG(INFO) << "Skipping parameter initialization"; } else { if (bias_term_) { @@ -151,20 +82,20 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, } // Initialize and fill the weights: // output channels x input channels per-group x kernel height x kernel width - this->blobs_[0].reset(new Blob(weight_shape)); + this->blobs_[0].reset(new Blob( + conv_out_channels_, conv_in_channels_ / group_, kernel_h_, kernel_w_)); shared_ptr > weight_filler(GetFiller( this->layer_param_.convolution_param().weight_filler())); weight_filler->Fill(this->blobs_[0].get()); // If necessary, initialize and fill the biases. 
if (bias_term_) { + vector bias_shape(1, num_output_); this->blobs_[1].reset(new Blob(bias_shape)); shared_ptr > bias_filler(GetFiller( this->layer_param_.convolution_param().bias_filler())); bias_filler->Fill(this->blobs_[1].get()); } } - kernel_dim_ = this->blobs_[0]->count(1); - weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; // Propagate gradients to the parameters (as directed by backward pass). this->param_propagate_down_.resize(this->blobs_.size(), true); } @@ -172,68 +103,52 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, template void BaseConvolutionLayer::Reshape(const vector*>& bottom, const vector*>& top) { - const int first_spatial_axis = channel_axis_ + 1; - CHECK_EQ(bottom[0]->num_axes(), first_spatial_axis + num_spatial_axes_) - << "bottom num_axes may not change."; - num_ = bottom[0]->count(0, channel_axis_); - CHECK_EQ(bottom[0]->shape(channel_axis_), channels_) - << "Input size incompatible with convolution kernel."; + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + num_ = bottom[0]->num(); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with" + " convolution kernel."; // TODO: generalize to handle inputs of different shapes. for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) { - CHECK(bottom[0]->shape() == bottom[bottom_id]->shape()) - << "All inputs must have the same shape."; + CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num."; + CHECK_EQ(channels_, bottom[bottom_id]->channels()) + << "Inputs must have same channels."; + CHECK_EQ(height_, bottom[bottom_id]->height()) + << "Inputs must have same height."; + CHECK_EQ(width_, bottom[bottom_id]->width()) + << "Inputs must have same width."; } // Shape the tops. 
- bottom_shape_ = &bottom[0]->shape(); compute_output_shape(); - vector top_shape(bottom[0]->shape().begin(), - bottom[0]->shape().begin() + channel_axis_); - top_shape.push_back(num_output_); - for (int i = 0; i < num_spatial_axes_; ++i) { - top_shape.push_back(output_shape_[i]); - } for (int top_id = 0; top_id < top.size(); ++top_id) { - top[top_id]->Reshape(top_shape); + top[top_id]->Reshape(num_, num_output_, height_out_, width_out_); } if (reverse_dimensions()) { - conv_out_spatial_dim_ = bottom[0]->count(first_spatial_axis); + conv_in_height_ = height_out_; + conv_in_width_ = width_out_; + conv_out_spatial_dim_ = height_ * width_; } else { - conv_out_spatial_dim_ = top[0]->count(first_spatial_axis); + conv_in_height_ = height_; + conv_in_width_ = width_; + conv_out_spatial_dim_ = height_out_ * width_out_; } - col_offset_ = kernel_dim_ * conv_out_spatial_dim_; + kernel_dim_ = conv_in_channels_ * kernel_h_ * kernel_w_; + weight_offset_ = conv_out_channels_ * kernel_dim_ / group_ / group_; + col_offset_ = kernel_dim_ * conv_out_spatial_dim_ / group_; output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; - // Setup input dimensions (conv_input_shape_). - vector bottom_dim_blob_shape(1, num_spatial_axes_ + 1); - conv_input_shape_.Reshape(bottom_dim_blob_shape); - int* conv_input_shape_data = conv_input_shape_.mutable_cpu_data(); - for (int i = 0; i < num_spatial_axes_ + 1; ++i) { - if (reverse_dimensions()) { - conv_input_shape_data[i] = top[0]->shape(channel_axis_ + i); - } else { - conv_input_shape_data[i] = bottom[0]->shape(channel_axis_ + i); - } - } // The im2col result buffer will only hold one image at a time to avoid // overly large memory usage. In the special case of 1x1 convolution // it goes lazily unused to save memory. 
- col_buffer_shape_.clear(); - col_buffer_shape_.push_back(kernel_dim_ * group_); - for (int i = 0; i < num_spatial_axes_; ++i) { - if (reverse_dimensions()) { - col_buffer_shape_.push_back(input_shape(i + 1)); - } else { - col_buffer_shape_.push_back(output_shape_[i]); - } + if (reverse_dimensions()) { + col_buffer_.Reshape(1, kernel_dim_, height_, width_); + } else { + col_buffer_.Reshape(1, kernel_dim_, height_out_, width_out_); } - col_buffer_.Reshape(col_buffer_shape_); - bottom_dim_ = bottom[0]->count(channel_axis_); - top_dim_ = top[0]->count(channel_axis_); - num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; - num_kernels_col2im_ = reverse_dimensions() ? top_dim_ : bottom_dim_; // Set up the all ones "bias multiplier" for adding biases by BLAS - out_spatial_dim_ = top[0]->count(first_spatial_axis); if (bias_term_) { - vector bias_multiplier_shape(1, out_spatial_dim_); + vector bias_multiplier_shape(1, height_out_ * width_out_); bias_multiplier_.Reshape(bias_multiplier_shape); caffe_set(bias_multiplier_.count(), Dtype(1), bias_multiplier_.mutable_cpu_data()); @@ -252,7 +167,7 @@ void BaseConvolutionLayer::forward_cpu_gemm(const Dtype* input, } for (int g = 0; g < group_; ++g) { caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, conv_out_channels_ / - group_, conv_out_spatial_dim_, kernel_dim_, + group_, conv_out_spatial_dim_, kernel_dim_ / group_, (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g, (Dtype)0., output + output_offset_ * g); } @@ -262,7 +177,7 @@ template void BaseConvolutionLayer::forward_cpu_bias(Dtype* output, const Dtype* bias) { caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - out_spatial_dim_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(), + height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(), (Dtype)1., output); } @@ -274,7 +189,7 @@ void BaseConvolutionLayer::backward_cpu_gemm(const Dtype* output, col_buff = input; } for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasTrans, 
CblasNoTrans, kernel_dim_, + caffe_cpu_gemm(CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g, (Dtype)0., col_buff + col_offset_ * g); @@ -294,7 +209,7 @@ void BaseConvolutionLayer::weight_cpu_gemm(const Dtype* input, } for (int g = 0; g < group_; ++g) { caffe_cpu_gemm(CblasNoTrans, CblasTrans, conv_out_channels_ / group_, - kernel_dim_, conv_out_spatial_dim_, + kernel_dim_ / group_, conv_out_spatial_dim_, (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g, (Dtype)1., weights + weight_offset_ * g); } @@ -303,7 +218,7 @@ void BaseConvolutionLayer::weight_cpu_gemm(const Dtype* input, template void BaseConvolutionLayer::backward_cpu_bias(Dtype* bias, const Dtype* input) { - caffe_cpu_gemv(CblasNoTrans, num_output_, out_spatial_dim_, 1., + caffe_cpu_gemv(CblasNoTrans, num_output_, height_out_ * width_out_, 1., input, bias_multiplier_.cpu_data(), 1., bias); } @@ -321,7 +236,7 @@ void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, } for (int g = 0; g < group_; ++g) { caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, conv_out_channels_ / - group_, conv_out_spatial_dim_, kernel_dim_, + group_, conv_out_spatial_dim_, kernel_dim_ / group_, (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g, (Dtype)0., output + output_offset_ * g); } @@ -331,7 +246,7 @@ template void BaseConvolutionLayer::forward_gpu_bias(Dtype* output, const Dtype* bias) { caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - out_spatial_dim_, 1, (Dtype)1., bias, bias_multiplier_.gpu_data(), + height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.gpu_data(), (Dtype)1., output); } @@ -343,7 +258,7 @@ void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, col_buff = input; } for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasTrans, CblasNoTrans, kernel_dim_, + caffe_gpu_gemm(CblasTrans, CblasNoTrans, kernel_dim_ / group_, 
conv_out_spatial_dim_, conv_out_channels_ / group_, (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g, (Dtype)0., col_buff + col_offset_ * g); @@ -363,7 +278,7 @@ void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, } for (int g = 0; g < group_; ++g) { caffe_gpu_gemm(CblasNoTrans, CblasTrans, conv_out_channels_ / group_, - kernel_dim_, conv_out_spatial_dim_, + kernel_dim_ / group_, conv_out_spatial_dim_, (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g, (Dtype)1., weights + weight_offset_ * g); } @@ -372,7 +287,7 @@ void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, template void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, const Dtype* input) { - caffe_gpu_gemv(CblasNoTrans, num_output_, out_spatial_dim_, 1., + caffe_gpu_gemv(CblasNoTrans, num_output_, height_out_ * width_out_, 1., input, bias_multiplier_.gpu_data(), 1., bias); } diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 86b500de859..95fba105b9a 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -48,16 +48,11 @@ void ConcatLayer::Reshape(const vector*>& bottom, } top[0]->Reshape(top_shape); CHECK_EQ(bottom_count_sum, top[0]->count()); - if (bottom.size() == 1) { - top[0]->ShareData(*bottom[0]); - top[0]->ShareDiff(*bottom[0]); - } } template void ConcatLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { - if (bottom.size() == 1) { return; } Dtype* top_data = top[0]->mutable_cpu_data(); int offset_concat_axis = 0; const int top_concat_axis = top[0]->shape(concat_axis_); @@ -77,7 +72,6 @@ void ConcatLayer::Forward_cpu(const vector*>& bottom, template void ConcatLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - if (bottom.size() == 1) { return; } const Dtype* top_diff = top[0]->cpu_diff(); int offset_concat_axis = 0; const int top_concat_axis = top[0]->shape(concat_axis_); diff --git 
a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/concat_layer.cu index 617701e2621..3c64c7ef224 100644 --- a/src/caffe/layers/concat_layer.cu +++ b/src/caffe/layers/concat_layer.cu @@ -28,7 +28,6 @@ __global__ void Concat(const int nthreads, const Dtype* in_data, template void ConcatLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { - if (bottom.size() == 1) { return; } Dtype* top_data = top[0]->mutable_gpu_data(); int offset_concat_axis = 0; const int top_concat_axis = top[0]->shape(concat_axis_); @@ -49,7 +48,6 @@ void ConcatLayer::Forward_gpu(const vector*>& bottom, template void ConcatLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - if (bottom.size() == 1) { return; } const Dtype* top_diff = top[0]->gpu_diff(); int offset_concat_axis = 0; const int top_concat_axis = top[0]->shape(concat_axis_); diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index fb50bb095ed..928ef5ee468 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -10,17 +10,10 @@ namespace caffe { template void ConvolutionLayer::compute_output_shape() { - const int* kernel_shape_data = this->kernel_shape_.cpu_data(); - const int* stride_data = this->stride_.cpu_data(); - const int* pad_data = this->pad_.cpu_data(); - this->output_shape_.clear(); - for (int i = 0; i < this->num_spatial_axes_; ++i) { - // i + 1 to skip channel axis - const int input_dim = this->input_shape(i + 1); - const int output_dim = (input_dim + 2 * pad_data[i] - kernel_shape_data[i]) - / stride_data[i] + 1; - this->output_shape_.push_back(output_dim); - } + this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_) + / this->stride_h_ + 1; + this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_) + / this->stride_w_ + 1; } template @@ -31,11 +24,11 @@ void ConvolutionLayer::Forward_cpu(const vector*>& bottom, const Dtype* bottom_data = bottom[i]->cpu_data(); Dtype* 
top_data = top[i]->mutable_cpu_data(); for (int n = 0; n < this->num_; ++n) { - this->forward_cpu_gemm(bottom_data + n * this->bottom_dim_, weight, - top_data + n * this->top_dim_); + this->forward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight, + top_data + top[i]->offset(n)); if (this->bias_term_) { const Dtype* bias = this->blobs_[1]->cpu_data(); - this->forward_cpu_bias(top_data + n * this->top_dim_, bias); + this->forward_cpu_bias(top_data + top[i]->offset(n), bias); } } } @@ -54,20 +47,20 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, if (this->bias_term_ && this->param_propagate_down_[1]) { Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff(); for (int n = 0; n < this->num_; ++n) { - this->backward_cpu_bias(bias_diff, top_diff + n * this->top_dim_); + this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n)); } } if (this->param_propagate_down_[0] || propagate_down[i]) { for (int n = 0; n < this->num_; ++n) { // gradient w.r.t. weight. Note that we will accumulate diffs. if (this->param_propagate_down_[0]) { - this->weight_cpu_gemm(bottom_data + n * this->bottom_dim_, - top_diff + n * this->top_dim_, weight_diff); + this->weight_cpu_gemm(bottom_data + bottom[i]->offset(n), + top_diff + top[i]->offset(n), weight_diff); } // gradient w.r.t. bottom data, if necessary. 
if (propagate_down[i]) { - this->backward_cpu_gemm(top_diff + n * this->top_dim_, weight, - bottom_diff + n * this->bottom_dim_); + this->backward_cpu_gemm(top_diff + top[i]->offset(n), weight, + bottom_diff + bottom[i]->offset(n)); } } } diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu index b429d2b47d0..b8a98ff7cc9 100644 --- a/src/caffe/layers/conv_layer.cu +++ b/src/caffe/layers/conv_layer.cu @@ -16,11 +16,11 @@ void ConvolutionLayer::Forward_gpu(const vector*>& bottom, const Dtype* bottom_data = bottom[i]->gpu_data(); Dtype* top_data = top[i]->mutable_gpu_data(); for (int n = 0; n < this->num_; ++n) { - this->forward_gpu_gemm(bottom_data + n * this->bottom_dim_, weight, - top_data + n * this->top_dim_); + this->forward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight, + top_data + top[i]->offset(n)); if (this->bias_term_) { const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data + n * this->top_dim_, bias); + this->forward_gpu_bias(top_data + top[i]->offset(n), bias); } } } @@ -37,7 +37,7 @@ void ConvolutionLayer::Backward_gpu(const vector*>& top, if (this->bias_term_ && this->param_propagate_down_[1]) { Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_bias(bias_diff, top_diff + n * this->top_dim_); + this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); } } if (this->param_propagate_down_[0] || propagate_down[i]) { @@ -46,13 +46,13 @@ void ConvolutionLayer::Backward_gpu(const vector*>& top, for (int n = 0; n < this->num_; ++n) { // gradient w.r.t. weight. Note that we will accumulate diffs. if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(bottom_data + n * this->bottom_dim_, - top_diff + n * this->top_dim_, weight_diff); + this->weight_gpu_gemm(bottom_data + bottom[i]->offset(n), + top_diff + top[i]->offset(n), weight_diff); } // gradient w.r.t. bottom data, if necessary. 
if (propagate_down[i]) { - this->backward_gpu_gemm(top_diff + n * this->top_dim_, weight, - bottom_diff + n * this->bottom_dim_); + this->backward_gpu_gemm(top_diff + top[i]->offset(n), weight, + bottom_diff + bottom[i]->offset(n)); } } } diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp index 3514fe2aba5..104d2b9d669 100644 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ b/src/caffe/layers/cudnn_conv_layer.cpp @@ -34,15 +34,14 @@ void CuDNNConvolutionLayer::LayerSetUp( } // Set the indexing parameters. + weight_offset_ = (this->num_output_ / this->group_) + * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_; bias_offset_ = (this->num_output_ / this->group_); // Create filter descriptor. - const int* kernel_shape_data = this->kernel_shape_.cpu_data(); - const int kernel_h = kernel_shape_data[0]; - const int kernel_w = kernel_shape_data[1]; cudnn::createFilterDesc(&filter_desc_, this->num_output_ / this->group_, this->channels_ / this->group_, - kernel_h, kernel_w); + this->kernel_h_, this->kernel_w_); // Create tensor descriptor(s) for data and corresponding convolution(s). for (int i = 0; i < bottom.size(); i++) { @@ -69,36 +68,29 @@ template void CuDNNConvolutionLayer::Reshape( const vector*>& bottom, const vector*>& top) { ConvolutionLayer::Reshape(bottom, top); - CHECK_EQ(2, this->num_spatial_axes_) - << "CuDNNConvolution input must have 2 spatial axes " - << "(e.g., height and width). 
" - << "Use 'engine: CAFFE' for general ND convolution."; - bottom_offset_ = this->bottom_dim_ / this->group_; - top_offset_ = this->top_dim_ / this->group_; - const int height = bottom[0]->shape(this->channel_axis_ + 1); - const int width = bottom[0]->shape(this->channel_axis_ + 2); - const int height_out = top[0]->shape(this->channel_axis_ + 1); - const int width_out = top[0]->shape(this->channel_axis_ + 2); - const int* pad_data = this->pad_.cpu_data(); - const int pad_h = pad_data[0]; - const int pad_w = pad_data[1]; - const int* stride_data = this->stride_.cpu_data(); - const int stride_h = stride_data[0]; - const int stride_w = stride_data[1]; + bottom_offset_ = (this->channels_ / this->group_) + * this->height_ * this->width_; + top_offset_ = (this->num_output_ / this->group_) + * this->height_out_ * this->width_out_; for (int i = 0; i < bottom.size(); i++) { cudnn::setTensor4dDesc(&bottom_descs_[i], this->num_, - this->channels_ / this->group_, height, width, - this->channels_ * height * width, - height * width, width, 1); + this->channels_ / this->group_, + this->height_, this->width_, + this->channels_ * this->height_ * this->width_, + this->height_ * this->width_, + this->width_, 1); cudnn::setTensor4dDesc(&top_descs_[i], this->num_, - this->num_output_ / this->group_, height_out, width_out, - this->num_output_ * this->out_spatial_dim_, - this->out_spatial_dim_, width_out, 1); + this->num_output_ / this->group_, + this->height_out_, this->width_out_, + this->num_output_ * this->height_out_ * this->width_out_, + this->height_out_ * this->width_out_, + this->width_out_, 1); cudnn::setConvolutionDesc(&conv_descs_[i], bottom_descs_[i], - filter_desc_, pad_h, pad_w, stride_h, stride_w); + filter_desc_, this->pad_h_, this->pad_w_, + this->stride_h_, this->stride_w_); } // Tensor descriptor for bias. 
diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu index 691152021a3..b4e802e13d1 100644 --- a/src/caffe/layers/cudnn_conv_layer.cu +++ b/src/caffe/layers/cudnn_conv_layer.cu @@ -14,15 +14,15 @@ __global__ void sync_conv_groups() { } template void CuDNNConvolutionLayer::Forward_gpu( const vector*>& bottom, const vector*>& top) { - const int* kernel_shape_data = this->kernel_shape_.cpu_data(); - const int kernel_h = kernel_shape_data[0]; - const int kernel_w = kernel_shape_data[1]; - const size_t workspace_limit_bytes = - kernel_h * kernel_w * this->channels_ * sizeof(int) + 1; - const Dtype* weight = this->blobs_[0]->gpu_data(); for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->gpu_data(); Dtype* top_data = top[i]->mutable_gpu_data(); + const Dtype* weight = this->blobs_[0]->gpu_data(); + + size_t workspace_limit_bytes = this->kernel_h_ * + this->kernel_w_ * + this->channels_ * + sizeof(int) + 1; // Forward through cuDNN in parallel over groups. for (int g = 0; g < this->group_; g++) { @@ -69,7 +69,7 @@ void CuDNNConvolutionLayer::Forward_gpu( CUDNN_CHECK(cudnnConvolutionForward(handle_[g], cudnn::dataType::one, bottom_descs_[i], bottom_data + bottom_offset_ * g, - filter_desc_, weight + this->weight_offset_ * g, + filter_desc_, weight + weight_offset_ * g, conv_descs_[i], algo, workspace, workspaceSizeInBytes, cudnn::dataType::zero, @@ -128,7 +128,7 @@ void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, top_descs_[i], top_diff + top_offset_ * g, conv_descs_[i], cudnn::dataType::one, - filter_desc_, weight_diff + this->weight_offset_ * g)); + filter_desc_, weight_diff + weight_offset_ * g)); } // Gradient w.r.t. bottom data. 
@@ -139,7 +139,7 @@ void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); CUDNN_CHECK(cudnnConvolutionBackwardData(handle_[2*this->group_ + g], cudnn::dataType::one, - filter_desc_, weight + this->weight_offset_ * g, + filter_desc_, weight + weight_offset_ * g, top_descs_[i], top_diff + top_offset_ * g, conv_descs_[i], cudnn::dataType::zero, diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 71f8cb099e8..0932d9feff3 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -1,6 +1,5 @@ -#ifdef USE_OPENCV #include -#endif // USE_OPENCV + #include #include diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index 91aabb315b2..a4612963b6b 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -10,17 +10,10 @@ namespace caffe { template void DeconvolutionLayer::compute_output_shape() { - const int* kernel_shape_data = this->kernel_shape_.cpu_data(); - const int* stride_data = this->stride_.cpu_data(); - const int* pad_data = this->pad_.cpu_data(); - this->output_shape_.clear(); - for (int i = 0; i < this->num_spatial_axes_; ++i) { - // i + 1 to skip channel axis - const int input_dim = this->input_shape(i + 1); - const int output_dim = stride_data[i] * (input_dim - 1) - + kernel_shape_data[i] - 2 * pad_data[i]; - this->output_shape_.push_back(output_dim); - } + this->height_out_ = this->stride_h_ * (this->height_ - 1) + this->kernel_h_ + - 2 * this->pad_h_; + this->width_out_ = this->stride_w_ * (this->width_ - 1) + this->kernel_w_ + - 2 * this->pad_w_; } template @@ -31,11 +24,11 @@ void DeconvolutionLayer::Forward_cpu(const vector*>& bottom, const Dtype* bottom_data = bottom[i]->cpu_data(); Dtype* top_data = top[i]->mutable_cpu_data(); for (int n = 0; n < this->num_; ++n) { - this->backward_cpu_gemm(bottom_data + n * this->bottom_dim_, weight, - top_data + n * this->top_dim_); + 
this->backward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight, + top_data + top[i]->offset(n)); if (this->bias_term_) { const Dtype* bias = this->blobs_[1]->cpu_data(); - this->forward_cpu_bias(top_data + n * this->top_dim_, bias); + this->forward_cpu_bias(top_data + top[i]->offset(n), bias); } } } @@ -54,21 +47,21 @@ void DeconvolutionLayer::Backward_cpu(const vector*>& top, if (this->bias_term_ && this->param_propagate_down_[1]) { Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff(); for (int n = 0; n < this->num_; ++n) { - this->backward_cpu_bias(bias_diff, top_diff + n * this->top_dim_); + this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n)); } } if (this->param_propagate_down_[0] || propagate_down[i]) { for (int n = 0; n < this->num_; ++n) { // Gradient w.r.t. weight. Note that we will accumulate diffs. if (this->param_propagate_down_[0]) { - this->weight_cpu_gemm(top_diff + n * this->top_dim_, - bottom_data + n * this->bottom_dim_, weight_diff); + this->weight_cpu_gemm(top_diff + top[i]->offset(n), + bottom_data + bottom[i]->offset(n), weight_diff); } // Gradient w.r.t. bottom data, if necessary, reusing the column buffer // we might have just computed above. 
if (propagate_down[i]) { - this->forward_cpu_gemm(top_diff + n * this->top_dim_, weight, - bottom_diff + n * this->bottom_dim_, + this->forward_cpu_gemm(top_diff + top[i]->offset(n), weight, + bottom_diff + bottom[i]->offset(n), this->param_propagate_down_[0]); } } diff --git a/src/caffe/layers/deconv_layer.cu b/src/caffe/layers/deconv_layer.cu index 5dbdcc3149f..8a1eed8aa16 100644 --- a/src/caffe/layers/deconv_layer.cu +++ b/src/caffe/layers/deconv_layer.cu @@ -16,11 +16,11 @@ void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, const Dtype* bottom_data = bottom[i]->gpu_data(); Dtype* top_data = top[i]->mutable_gpu_data(); for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_gemm(bottom_data + n * this->bottom_dim_, weight, - top_data + n * this->top_dim_); + this->backward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight, + top_data + top[i]->offset(n)); if (this->bias_term_) { const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data + n * this->top_dim_, bias); + this->forward_gpu_bias(top_data + top[i]->offset(n), bias); } } } @@ -39,20 +39,20 @@ void DeconvolutionLayer::Backward_gpu(const vector*>& top, if (this->bias_term_ && this->param_propagate_down_[1]) { Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_bias(bias_diff, top_diff + n * this->top_dim_); + this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); } } if (this->param_propagate_down_[0] || propagate_down[i]) { for (int n = 0; n < this->num_; ++n) { // gradient w.r.t. weight. Note that we will accumulate diffs. if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(top_diff + n * this->top_dim_, - bottom_data + n * this->bottom_dim_, weight_diff); + this->weight_gpu_gemm(top_diff + top[i]->offset(n), + bottom_data + bottom[i]->offset(n), weight_diff); } // gradient w.r.t. bottom data, if necessary. 
if (propagate_down[i]) { - this->forward_gpu_gemm(top_diff + n * this->top_dim_, weight, - bottom_diff + n * this->bottom_dim_, + this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight, + bottom_diff + bottom[i]->offset(n), this->param_propagate_down_[0]); } } diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index 595c9dbbe5e..1c802714e33 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -11,106 +11,54 @@ template void Im2colLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - force_nd_im2col_ = conv_param.force_nd_im2col(); - const int input_num_dims = bottom[0]->shape().size(); - channel_axis_ = bottom[0]->CanonicalAxisIndex(conv_param.axis()); - const int first_spatial_dim = channel_axis_ + 1; - num_spatial_axes_ = input_num_dims - first_spatial_dim; - CHECK_GE(num_spatial_axes_, 1); - vector dim_blob_shape(1, num_spatial_axes_); - // Setup filter kernel dimensions (kernel_shape_). 
- kernel_shape_.Reshape(dim_blob_shape); - int* kernel_shape_data = kernel_shape_.mutable_cpu_data(); - if (conv_param.has_kernel_h() || conv_param.has_kernel_w()) { - CHECK_EQ(num_spatial_axes_, 2) - << "kernel_h & kernel_w can only be used for 2D convolution."; - CHECK_EQ(0, conv_param.kernel_size_size()) - << "Either kernel_size or kernel_h/w should be specified; not both."; - kernel_shape_data[0] = conv_param.kernel_h(); - kernel_shape_data[1] = conv_param.kernel_w(); + CHECK(!conv_param.has_kernel_size() != + !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; + CHECK(conv_param.has_kernel_size() || + (conv_param.has_kernel_h() && conv_param.has_kernel_w())) + << "For non-square filters both kernel_h and kernel_w are required."; + CHECK((!conv_param.has_pad() && conv_param.has_pad_h() + && conv_param.has_pad_w()) + || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) + << "pad is pad OR pad_h and pad_w are required."; + CHECK((!conv_param.has_stride() && conv_param.has_stride_h() + && conv_param.has_stride_w()) + || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) + << "Stride is stride OR stride_h and stride_w are required."; + if (conv_param.has_kernel_size()) { + kernel_h_ = kernel_w_ = conv_param.kernel_size(); } else { - const int num_kernel_dims = conv_param.kernel_size_size(); - CHECK(num_kernel_dims == 1 || num_kernel_dims == num_spatial_axes_) - << "kernel_size must be specified once, or once per spatial dimension " - << "(kernel_size specified " << num_kernel_dims << " times; " - << num_spatial_axes_ << " spatial dims);"; - for (int i = 0; i < num_spatial_axes_; ++i) { - kernel_shape_data[i] = - conv_param.kernel_size((num_kernel_dims == 1) ? 
0 : i); - } + kernel_h_ = conv_param.kernel_h(); + kernel_w_ = conv_param.kernel_w(); } - for (int i = 0; i < num_spatial_axes_; ++i) { - CHECK_GT(kernel_shape_data[i], 0) << "Filter dimensions must be nonzero."; - } - // Setup stride dimensions (stride_). - stride_.Reshape(dim_blob_shape); - int* stride_data = stride_.mutable_cpu_data(); - if (conv_param.has_stride_h() || conv_param.has_stride_w()) { - CHECK_EQ(num_spatial_axes_, 2) - << "stride_h & stride_w can only be used for 2D convolution."; - CHECK_EQ(0, conv_param.stride_size()) - << "Either stride or stride_h/w should be specified; not both."; - stride_data[0] = conv_param.stride_h(); - stride_data[1] = conv_param.stride_w(); + CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; + CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; + if (!conv_param.has_pad_h()) { + pad_h_ = pad_w_ = conv_param.pad(); } else { - const int num_stride_dims = conv_param.stride_size(); - CHECK(num_stride_dims == 0 || num_stride_dims == 1 || - num_stride_dims == num_spatial_axes_) - << "stride must be specified once, or once per spatial dimension " - << "(stride specified " << num_stride_dims << " times; " - << num_spatial_axes_ << " spatial dims);"; - const int kDefaultStride = 1; - for (int i = 0; i < num_spatial_axes_; ++i) { - stride_data[i] = (num_stride_dims == 0) ? kDefaultStride : - conv_param.stride((num_stride_dims == 1) ? 0 : i); - CHECK_GT(stride_data[i], 0) << "Stride dimensions must be nonzero."; - } + pad_h_ = conv_param.pad_h(); + pad_w_ = conv_param.pad_w(); } - // Setup pad dimensions (pad_). 
- pad_.Reshape(dim_blob_shape); - int* pad_data = pad_.mutable_cpu_data(); - if (conv_param.has_pad_h() || conv_param.has_pad_w()) { - CHECK_EQ(num_spatial_axes_, 2) - << "pad_h & pad_w can only be used for 2D convolution."; - CHECK_EQ(0, conv_param.pad_size()) - << "Either pad or pad_h/w should be specified; not both."; - pad_data[0] = conv_param.pad_h(); - pad_data[1] = conv_param.pad_w(); + if (!conv_param.has_stride_h()) { + stride_h_ = stride_w_ = conv_param.stride(); } else { - const int num_pad_dims = conv_param.pad_size(); - CHECK(num_pad_dims == 0 || num_pad_dims == 1 || - num_pad_dims == num_spatial_axes_) - << "pad must be specified once, or once per spatial dimension " - << "(pad specified " << num_pad_dims << " times; " - << num_spatial_axes_ << " spatial dims);"; - const int kDefaultPad = 0; - for (int i = 0; i < num_spatial_axes_; ++i) { - pad_data[i] = (num_pad_dims == 0) ? kDefaultPad : - conv_param.pad((num_pad_dims == 1) ? 0 : i); - } + stride_h_ = conv_param.stride_h(); + stride_w_ = conv_param.stride_w(); } } template void Im2colLayer::Reshape(const vector*>& bottom, const vector*>& top) { - vector top_shape = bottom[0]->shape(); - const int* kernel_shape_data = kernel_shape_.cpu_data(); - const int* stride_data = stride_.cpu_data(); - const int* pad_data = pad_.cpu_data(); - for (int i = 0; i < num_spatial_axes_; ++i) { - top_shape[channel_axis_] *= kernel_shape_data[i]; - const int input_dim = bottom[0]->shape(channel_axis_ + i + 1); - const int output_dim = (input_dim + 2 * pad_data[i] - kernel_shape_data[i]) - / stride_data[i] + 1; - top_shape[channel_axis_ + i + 1] = output_dim; - } - top[0]->Reshape(top_shape); - num_ = bottom[0]->count(0, channel_axis_); - bottom_dim_ = bottom[0]->count(channel_axis_); - top_dim_ = top[0]->count(channel_axis_); - - channels_ = bottom[0]->shape(channel_axis_); + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + channels_ = 
bottom[0]->channels(); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + top[0]->Reshape( + bottom[0]->num(), channels_ * kernel_h_ * kernel_w_, + (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1, + (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1); } template @@ -118,27 +66,10 @@ void Im2colLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); - for (int n = 0; n < num_; ++n) { - DCHECK_EQ(bottom[0]->shape().size() - channel_axis_, num_spatial_axes_ + 1); - DCHECK_EQ(top[0]->shape().size() - channel_axis_, num_spatial_axes_ + 1); - DCHECK_EQ(kernel_shape_.count(), num_spatial_axes_); - DCHECK_EQ(pad_.count(), num_spatial_axes_); - DCHECK_EQ(stride_.count(), num_spatial_axes_); - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - im2col_cpu(bottom_data + n * bottom_dim_, channels_, - bottom[0]->shape(channel_axis_ + 1), - bottom[0]->shape(channel_axis_ + 2), - kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], - top_data + n * top_dim_); - } else { - im2col_nd_cpu(bottom_data + n * bottom_dim_, num_spatial_axes_, - bottom[0]->shape().data() + channel_axis_, - top[0]->shape().data() + channel_axis_, - kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(), - top_data + n * top_dim_); - } + for (int n = 0; n < bottom[0]->num(); ++n) { + im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, + width_, kernel_h_, kernel_w_, pad_h_, pad_w_, + stride_h_, stride_w_, top_data + top[0]->offset(n)); } } @@ -147,22 +78,10 @@ void Im2colLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - for (int n = 0; n < num_; ++n) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - col2im_cpu(top_diff + 
n * top_dim_, channels_, - bottom[0]->shape(channel_axis_ + 1), - bottom[0]->shape(channel_axis_ + 2), - kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], - bottom_diff + n * bottom_dim_); - } else { - col2im_nd_cpu(top_diff + n * top_dim_, num_spatial_axes_, - bottom[0]->shape().data() + channel_axis_, - top[0]->shape().data() + channel_axis_, - kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(), - bottom_diff + n * bottom_dim_); - } + for (int n = 0; n < top[0]->num(); ++n) { + col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, + stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n)); } } diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu index cd507623c78..9c338b14cb7 100644 --- a/src/caffe/layers/im2col_layer.cu +++ b/src/caffe/layers/im2col_layer.cu @@ -12,23 +12,10 @@ void Im2colLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - const int num_kernels = channels_ * top[0]->count(channel_axis_ + 1); - for (int n = 0; n < num_; ++n) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - im2col_gpu(bottom_data + n * bottom_dim_, channels_, - bottom[0]->shape(channel_axis_ + 1), - bottom[0]->shape(channel_axis_ + 2), - kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], - top_data + n * top_dim_); - } else { - im2col_nd_gpu(bottom_data + n * bottom_dim_, num_spatial_axes_, - num_kernels, bottom[0]->gpu_shape() + channel_axis_, - top[0]->gpu_shape() + channel_axis_, - kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(), - top_data + n * top_dim_); - } + for (int n = 0; n < bottom[0]->num(); ++n) { + im2col_gpu(bottom_data + bottom[0]->offset(n), 
channels_, height_, + width_, kernel_h_, kernel_w_, pad_h_, pad_w_, + stride_h_, stride_w_, top_data + top[0]->offset(n)); } } @@ -37,22 +24,10 @@ void Im2colLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int n = 0; n < num_; ++n) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - col2im_gpu(top_diff + n * top_dim_, channels_, - bottom[0]->shape(channel_axis_ + 1), - bottom[0]->shape(channel_axis_ + 2), - kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], - bottom_diff + n * bottom_dim_); - } else { - col2im_nd_gpu(top_diff + n * top_dim_, num_spatial_axes_, bottom_dim_, - bottom[0]->gpu_shape() + channel_axis_, - top[0]->gpu_shape() + channel_axis_, - kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(), - bottom_diff + n * bottom_dim_); - } + for (int n = 0; n < top[0]->num(); ++n) { + col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, + stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n)); } } diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index 3d2190f8bbb..223ba3a75ca 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -1,4 +1,3 @@ -#ifdef USE_OPENCV #include #include // NOLINT(readability/streams) @@ -165,4 +164,3 @@ INSTANTIATE_CLASS(ImageDataLayer); REGISTER_LAYER_CLASS(ImageData); } // namespace caffe -#endif // USE_OPENCV diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp index 2370aa04d3b..42de4198bc4 100644 --- a/src/caffe/layers/memory_data_layer.cpp +++ b/src/caffe/layers/memory_data_layer.cpp @@ -1,6 +1,4 @@ -#ifdef USE_OPENCV #include -#endif // USE_OPENCV #include @@ -55,7 +53,6 @@ void 
MemoryDataLayer::AddDatumVector(const vector& datum_vector) { has_new_data_ = true; } -#ifdef USE_OPENCV template void MemoryDataLayer::AddMatVector(const vector& mat_vector, const vector& labels) { @@ -79,7 +76,6 @@ void MemoryDataLayer::AddMatVector(const vector& mat_vector, Reset(top_data, top_label, num); has_new_data_ = true; } -#endif // USE_OPENCV template void MemoryDataLayer::Reset(Dtype* data, Dtype* labels, int n) { diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp index 0a059ae88fe..e4418c9cf9c 100644 --- a/src/caffe/layers/slice_layer.cpp +++ b/src/caffe/layers/slice_layer.cpp @@ -67,16 +67,11 @@ void SliceLayer::Reshape(const vector*>& bottom, } } CHECK_EQ(count, bottom[0]->count()); - if (top.size() == 1) { - top[0]->ShareData(*bottom[0]); - top[0]->ShareDiff(*bottom[0]); - } } template void SliceLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { - if (top.size() == 1) { return; } int offset_slice_axis = 0; const Dtype* bottom_data = bottom[0]->cpu_data(); const int bottom_slice_axis = bottom[0]->shape(slice_axis_); @@ -97,7 +92,7 @@ void SliceLayer::Forward_cpu(const vector*>& bottom, template void SliceLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0] || top.size() == 1) { return; } + if (!propagate_down[0]) { return; } int offset_slice_axis = 0; Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); const int bottom_slice_axis = bottom[0]->shape(slice_axis_); diff --git a/src/caffe/layers/slice_layer.cu b/src/caffe/layers/slice_layer.cu index e8dc6cd98fc..796841d3f52 100644 --- a/src/caffe/layers/slice_layer.cu +++ b/src/caffe/layers/slice_layer.cu @@ -28,7 +28,6 @@ __global__ void Slice(const int nthreads, const Dtype* in_data, template void SliceLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { - if (top.size() == 1) { return; } int offset_slice_axis = 0; const Dtype* bottom_data = bottom[0]->gpu_data(); const int 
bottom_slice_axis = bottom[0]->shape(slice_axis_); @@ -49,7 +48,7 @@ void SliceLayer::Forward_gpu(const vector*>& bottom, template void SliceLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0] || top.size() == 1) { return; } + if (!propagate_down[0]) { return; } int offset_slice_axis = 0; Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); const int bottom_slice_axis = bottom[0]->shape(slice_axis_); diff --git a/src/caffe/layers/triplet_loss_layer.cpp b/src/caffe/layers/triplet_loss_layer.cpp new file mode 100644 index 00000000000..7d11ff59ce4 --- /dev/null +++ b/src/caffe/layers/triplet_loss_layer.cpp @@ -0,0 +1,397 @@ +#include +#include + +#include "caffe/layer.hpp" +#include "caffe/loss_layers.hpp" +#include "caffe/util/io.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void TripletLossLayer::LayerSetUp( + const vector*>& bottom, const vector*>& top) { + LossLayer::LayerSetUp(bottom, top); + // number of triplet in a batch + int num_triplets = this->layer_param_.triplet_loss_param().num_triplets(); + // dimension of each descriptor + int dim = bottom[0]->count()/bottom[0]->num(); + CHECK_EQ(bottom[0]->channels(), dim); + CHECK_EQ(bottom[0]->height(), 1); + CHECK_EQ(bottom[0]->width(), 1); + CHECK_EQ(bottom[1]->channels(), 1); + CHECK_EQ(bottom[1]->height(), 1); + CHECK_EQ(bottom[1]->width(), 1); + // In each set, we have: + // the descriptor of reference sample, closest sample, and negative samples + // number of sets in the whole batch + int num_set = bottom[0]->num()/(2 + num_triplets); + dist_sq_.Reshape(num_set, 1, 1, 1); + diff_pos.Reshape(num_set, dim, 1, 1); + dist_sq_pos.Reshape(num_set, 1, 1, 1); + diff_neg.Reshape(num_set, dim, 1, 1); + dist_sq_neg.Reshape(num_set, 1, 1, 1); + // vector of ones used to sum along channels + summer_vec_.Reshape(bottom[0]->channels(), 1, 1, 1); + for (int i = 0; i < bottom[0]->channels(); ++i) + 
summer_vec_.mutable_cpu_data()[i] = Dtype(1); +} + +template +void TripletLossLayer::Forward_cpu( + const vector*>& bottom, + const vector*>& top) { + Dtype margin = this->layer_param_.triplet_loss_param().margin(); + Dtype losstype = this->layer_param_.triplet_loss_param().losstype(); + int num_triplets = this->layer_param_.triplet_loss_param().num_triplets(); + CHECK_EQ(bottom[0]->num()%(2 + num_triplets), 0); + Dtype loss(0.0); + int dim = bottom[0]->count()/bottom[0]->num(); + int num_set = bottom[0]->num()/(2 + num_triplets); + if (losstype == 0) { + for (int i = 0; i < num_set; ++i) { + caffe_sub( + dim, + bottom[0]->cpu_data() + (2 + num_triplets)*i*dim, // reference + bottom[0]->cpu_data() + ((2 + num_triplets)*i + 1)*dim, // positive + diff_pos.mutable_cpu_data() + i*dim); // reference-pose_close + // Loss component calculated from reference and close one + dist_sq_pos.mutable_cpu_data()[i] = caffe_cpu_dot(dim, + diff_pos.cpu_data() + i*dim, diff_pos.cpu_data() + i*dim); + // a b is a similar pair for pair wise + // loss accumulated by the pair wise part + loss += dist_sq_pos.cpu_data()[i]; + for (int triplet = 0; triplet < num_triplets; ++triplet) { + // Triplet loss accumulation + // a and negative[triplet] is a similar pair for triplet + dist_sq_.mutable_cpu_data()[i] = dist_sq_pos.cpu_data()[i]; + // Loss component calculated from negative part + caffe_sub( + dim, + bottom[0]->cpu_data() + (2 + num_triplets)*i*dim, // reference + bottom[0]->cpu_data() + ((2 + num_triplets)*i + 2 + triplet)*dim, + diff_neg.mutable_cpu_data() + i*dim); // reference-negative + dist_sq_neg.mutable_cpu_data()[i] = caffe_cpu_dot(dim, + diff_neg.cpu_data() + i*dim, diff_neg.cpu_data() + i*dim); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_cpu_data()[i] -= dist_sq_neg.cpu_data()[i]; + // loss accumulated accumulated by the triplet part + loss += std::max(margin + dist_sq_.cpu_data()[i], Dtype(0.0)); + } + } + loss = loss / 
static_cast(num_set) / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; + } else { + for (int i = 0; i < num_set; ++i) { + caffe_sub( + dim, + bottom[0]->cpu_data() + (2 + num_triplets)*i*dim, // reference + bottom[0]->cpu_data() + ((2 + num_triplets)*i + 1)*dim, // positive + diff_pos.mutable_cpu_data() + i*dim); // reference-pose_close + // Loss component calculated from reference and close one + dist_sq_pos.mutable_cpu_data()[i] = caffe_cpu_dot(dim, + diff_pos.cpu_data() + i*dim, diff_pos.cpu_data() + i*dim); + // a b is a similar pair for pair wise + // loss accumulated by the pair wise part + loss += dist_sq_pos.cpu_data()[i]; + for (int triplet = 0; triplet < num_triplets; ++triplet) { + dist_sq_.mutable_cpu_data()[i] = dist_sq_pos.mutable_cpu_data()[i]; + dist_sq_.mutable_cpu_data()[i] += margin; + // Loss component calculated from negative part + caffe_sub( + dim, + bottom[0]->cpu_data() + (2 + num_triplets)*i*dim, // reference + bottom[0]->cpu_data() + ((2 + num_triplets)*i + 2 + triplet)*dim, + diff_neg.mutable_cpu_data() + i*dim); // reference-negative + dist_sq_neg.mutable_cpu_data()[i] = caffe_cpu_dot(dim, + diff_neg.cpu_data() + i*dim, diff_neg.cpu_data() + i*dim); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_cpu_data()[i] = 1 - \ + dist_sq_neg.cpu_data()[i] / dist_sq_.mutable_cpu_data()[i]; + // loss accumulated accumulated by the triplet part + loss += std::max(dist_sq_.cpu_data()[i], Dtype(0.0)); + } + } + loss = loss / static_cast(num_set) / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; + } +} + +template +void TripletLossLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + Dtype margin = this->layer_param_.triplet_loss_param().margin(); + Dtype losstype = this->layer_param_.triplet_loss_param().losstype(); + int num_triplets = this->layer_param_.triplet_loss_param().num_triplets(); + int dim = bottom[0]->count()/bottom[0]->num(); + int num_set = 
bottom[0]->num()/(2 + num_triplets); + if (losstype == 0) { + // BP for feat1(extracted from reference) + for (int i = 0; i < 1; ++i) { + if (propagate_down[0]) { + const Dtype sign = 1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / + static_cast(num_set); + for (int j = 0; j < num_set; ++j) { + Dtype* bout = bottom[0]->mutable_cpu_diff(); + // the pair part + caffe_cpu_axpby( + dim, + alpha, + diff_pos.cpu_data() + (j*dim), + Dtype(0.0), + bout + ((2 + num_triplets)*j + i)*dim); + // the num_triplets triplet part + for (int triplet = 0; triplet < num_triplets; ++triplet) { + caffe_sub( + dim, + bottom[0]->cpu_data() + (2 + num_triplets)*j*dim, // reference + bottom[0]->cpu_data() + ((2 + num_triplets)*j + 2 + triplet)*dim, + diff_neg.mutable_cpu_data() + j*dim); // reference-negative + // Triplet loss accumulation + // a and negative[triplet] is a similar pair for triplet + dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.cpu_data()[j]; + dist_sq_neg.mutable_cpu_data()[j] = caffe_cpu_dot(dim, + diff_neg.cpu_data() + j*dim, diff_neg.cpu_data() + j*dim); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_cpu_data()[j] -= dist_sq_neg.cpu_data()[j]; + // Loss component calculated from negative part + if ((margin + dist_sq_.cpu_data()[j]) > Dtype(0.0)) { + // similar pair in triplet + caffe_cpu_axpby( + dim, + alpha, + diff_pos.cpu_data() + (j*dim), + Dtype(1.0), + bout + (2 + num_triplets)*j*dim); + // dissimilar pair in triplet + caffe_cpu_axpby( + dim, + -alpha, + diff_neg.cpu_data() + (j*dim), + Dtype(1.0), + bout + ((2 + num_triplets)*j + i)*dim); + } + } + } + } + } + // BP for feat2(extracted from the closest sample) + for (int i = 1; i < 2; ++i) { + if (propagate_down[0]) { + const Dtype sign = -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / + static_cast(num_set); + for (int j = 0; j < num_set; ++j) { + Dtype* bout = bottom[0]->mutable_cpu_diff(); + // the pair part + caffe_cpu_axpby( + dim, + alpha, + 
diff_pos.cpu_data() + (j*dim), + Dtype(0.0), + bout + ((2 + num_triplets)*j + i)*dim); + // the num_triplets triplet part + for (int triplet = 0; triplet < num_triplets; ++triplet) { + caffe_sub( + dim, + bottom[0]->cpu_data() + (2 + num_triplets)*j*dim, // reference + bottom[0]->cpu_data() + ((2 + num_triplets)*j + 2 + triplet)*dim, + diff_neg.mutable_cpu_data() + j*dim); // reference-negative + // Triplet loss accumulation + // a and negative[triplet] is a similar pair for triplet + dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.cpu_data()[j]; + dist_sq_neg.mutable_cpu_data()[j] = caffe_cpu_dot(dim, + diff_neg.cpu_data() + j*dim, diff_neg.cpu_data() + j*dim); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_cpu_data()[j] -= dist_sq_neg.cpu_data()[j]; + if ((margin + dist_sq_.cpu_data()[j]) > Dtype(0.0)) { + // similar pair in triplet + caffe_cpu_axpby( + dim, + alpha, + diff_pos.cpu_data() + (j*dim), + Dtype(1.0), + bout + ((2 + num_triplets)*j + i)*dim); + } + } + } + } + } + // BP for negative feature used in the num_triplets triplet part + for (int i = 2; i < 2 + num_triplets; ++i) { + if (propagate_down[0]) { + const Dtype sign = 1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / + static_cast(num_set); + for (int j = 0; j < num_set; ++j) { + Dtype* bout = bottom[0]->mutable_cpu_diff(); + caffe_sub( + dim, + bottom[0]->cpu_data() + (2 + num_triplets)*j*dim, // reference + bottom[0]->cpu_data() + ((2 + num_triplets)*j + i)*dim, + diff_neg.mutable_cpu_data() + j*dim); // reference-negative + // Triplet loss accumulation + // a and negative[triplet] is a similar pair for triplet + dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.cpu_data()[j]; + dist_sq_neg.mutable_cpu_data()[j] = caffe_cpu_dot(dim, + diff_neg.cpu_data() + j*dim, diff_neg.cpu_data() + j*dim); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_cpu_data()[j] -= dist_sq_neg.cpu_data()[j]; + if ((margin + dist_sq_.cpu_data()[j]) > 
Dtype(0.0)) { + // dissimilar pairs + caffe_cpu_axpby( + dim, + alpha, + diff_neg.cpu_data() + (j*dim), + Dtype(0.0), + bout + ((2 + num_triplets)*j + i)*dim); + } else { + caffe_set(dim, Dtype(0), bout + ((2 + num_triplets)*j + i)*dim); + } + } + } + } + } else { + for (int i = 0; i < 1; ++i) { + // BP for data1(feat1) + if (propagate_down[0]) { + const Dtype sign = 1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / + static_cast(num_set); + for (int j = 0; j < num_set; ++j) { + Dtype* bout = bottom[0]->mutable_cpu_diff(); + // the pair part + caffe_cpu_axpby( + dim, + alpha, + diff_pos.cpu_data() + (j*dim), + Dtype(0.0), + bout + ((2 + num_triplets)*j + i)*dim); + // the num_triplets triplet part + for (int triplet = 0; triplet < num_triplets; ++triplet) { + dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.mutable_cpu_data()[j]; + dist_sq_.mutable_cpu_data()[j] += margin; + // Loss component calculated from negative part + caffe_sub( + dim, + bottom[0]->cpu_data() + (2 + num_triplets)*j*dim, // reference + bottom[0]->cpu_data() + ((2 + num_triplets)*j + 2 + triplet)*dim, + diff_neg.mutable_cpu_data() + j*dim); // reference-negative + dist_sq_neg.mutable_cpu_data()[j] = caffe_cpu_dot(dim, + diff_neg.cpu_data() + j*dim, diff_neg.cpu_data() + j*dim); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_cpu_data()[j] = 1 - \ + dist_sq_neg.cpu_data()[j] / dist_sq_.mutable_cpu_data()[j]; + // loss accumulated accumulated by the triplet part + if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) { + caffe_cpu_axpby( + dim, + alpha*dist_sq_neg.mutable_cpu_data()[j]\ + /((dist_sq_pos.mutable_cpu_data()[j]+margin)\ + *(dist_sq_pos.mutable_cpu_data()[j]+margin)), + diff_pos.cpu_data() + (j*dim), + Dtype(1.0), + bout + ((2 + num_triplets)*j + i)*dim); + caffe_cpu_axpby( + dim, + -alpha/(dist_sq_pos.mutable_cpu_data()[j] + margin), + diff_neg.cpu_data() + (j*dim), + Dtype(1.0), + bout + ((2 + num_triplets)*j + i)*dim); + } + } + } + } + } + for (int i = 1; 
i < 2; ++i) { + // BP for positive data(feat2) + if (propagate_down[0]) { + const Dtype sign = -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / + static_cast(num_set); + for (int j = 0; j < num_set; ++j) { + Dtype* bout = bottom[0]->mutable_cpu_diff(); + // the pair part + caffe_cpu_axpby( + dim, + alpha, + diff_pos.cpu_data() + (j*dim), + Dtype(0.0), + bout + ((2 + num_triplets)*j + i)*dim); + // the num_triplets triplet part + for (int triplet = 0; triplet < num_triplets; ++triplet) { + dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.mutable_cpu_data()[j]; + dist_sq_.mutable_cpu_data()[j] += margin; + // Loss component calculated from negative part + caffe_sub( + dim, + bottom[0]->cpu_data() + (2 + num_triplets)*j*dim, // reference + bottom[0]->cpu_data() + ((2 + num_triplets)*j + 2 + triplet)*dim, + diff_neg.mutable_cpu_data() + j*dim); // reference-negative + dist_sq_neg.mutable_cpu_data()[j] = caffe_cpu_dot(dim, + diff_neg.cpu_data() + j*dim, diff_neg.cpu_data() + j*dim); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_cpu_data()[j] = 1 - \ + dist_sq_neg.cpu_data()[j] / dist_sq_.mutable_cpu_data()[j]; + // loss accumulated accumulated by the triplet part + if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) { + caffe_cpu_axpby( + dim, + alpha*dist_sq_neg.mutable_cpu_data()[j]\ + /((dist_sq_pos.mutable_cpu_data()[j]+margin)\ + *(dist_sq_pos.mutable_cpu_data()[j]+margin)), + diff_pos.cpu_data() + (j*dim), + Dtype(1.0), + bout + ((2 + num_triplets)*j + i)*dim); + } + } + } + } + } + for (int i = 2; i < 2 + num_triplets; ++i) { + // BP for negative data(feat3) + if (propagate_down[0]) { + const Dtype sign = 1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / + static_cast(num_set); + for (int j = 0; j < num_set; ++j) { + Dtype* bout = bottom[0]->mutable_cpu_diff(); + dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.mutable_cpu_data()[j]; + dist_sq_.mutable_cpu_data()[j] += margin; + // Loss component calculated from negative part + 
caffe_sub( + dim, + bottom[0]->cpu_data() + (2 + num_triplets)*j*dim, // reference + bottom[0]->cpu_data() + ((2 + num_triplets)*j + i)*dim, + diff_neg.mutable_cpu_data() + j*dim); // reference-negative + dist_sq_neg.mutable_cpu_data()[j] = caffe_cpu_dot(dim, + diff_neg.cpu_data() + j*dim, diff_neg.cpu_data() + j*dim); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_cpu_data()[j] = 1 - \ + dist_sq_neg.cpu_data()[j] / dist_sq_.mutable_cpu_data()[j]; + // loss accumulated accumulated by the triplet part + if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) { + caffe_cpu_axpby( + dim, + alpha/(dist_sq_pos.mutable_cpu_data()[j] + margin), + diff_neg.cpu_data() + (j*dim), + Dtype(0.0), + bout + ((2 + num_triplets)*j + i)*dim); + } else { + caffe_set(dim, Dtype(0), bout + ((2 + num_triplets)*j + i)*dim); + } + } + } + } + } +} + +#ifdef CPU_ONLY +STUB_GPU(TripletLossLayer); +#endif + +INSTANTIATE_CLASS(TripletLossLayer); +REGISTER_LAYER_CLASS(TripletLoss); + +} // namespace caffe diff --git a/src/caffe/layers/triplet_loss_layer.cu b/src/caffe/layers/triplet_loss_layer.cu new file mode 100644 index 00000000000..cd1fbb1201b --- /dev/null +++ b/src/caffe/layers/triplet_loss_layer.cu @@ -0,0 +1,392 @@ +#include +#include + +#include "caffe/layer.hpp" +#include "caffe/util/io.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/vision_layers.hpp" + +namespace caffe { + +template +void TripletLossLayer::Forward_gpu( + const vector*>& bottom, + const vector*>& top) { + Dtype margin = this->layer_param_.triplet_loss_param().margin(); + Dtype losstype = this->layer_param_.triplet_loss_param().losstype(); + int num_triplets = this->layer_param_.triplet_loss_param().num_triplets(); + CHECK_EQ(bottom[0]->num()%(2 + num_triplets), 0); + Dtype loss(0.0); + int dim = bottom[0]->count()/bottom[0]->num(); + int num_set = bottom[0]->num()/(2 + num_triplets); + if (losstype == 0) { + for (int i = 0; i < num_set; ++i) { + caffe_gpu_sub( + dim, + 
bottom[0]->gpu_data() + (2 + num_triplets)*i*dim, // reference + bottom[0]->gpu_data() + ((2 + num_triplets)*i + 1)*dim, // positive + diff_pos.mutable_gpu_data() + i*dim); // reference-pose_close + caffe_gpu_dot( + dim, + diff_pos.gpu_data() + i*dim, + diff_pos.gpu_data() + i*dim, + dist_sq_pos.mutable_cpu_data() + i); + // a b is a similar pair for pair wise + // loss accumulated by the pair wise part + loss += dist_sq_pos.gpu_data()[i]; + for (int triplet = 0; triplet < num_triplets; ++triplet) { + // Triplet loss accumulation + // a and negative[triplet] is a similar pair for triplet + dist_sq_.mutable_gpu_data()[i] = dist_sq_pos.gpu_data()[i]; + // Loss component calculated from negative part + caffe_gpu_sub( + dim, + bottom[0]->gpu_data() + (2 + num_triplets)*i*dim, // reference + bottom[0]->gpu_data() + ((2 + num_triplets)*i + 2 + triplet)*dim, + diff_neg.mutable_gpu_data() + i*dim); // reference-negative + caffe_gpu_dot( + dim, + diff_neg.gpu_data() + i*dim, + diff_neg.gpu_data() + i*dim, + dist_sq_neg.mutable_cpu_data() + i); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_gpu_data()[i] -= dist_sq_neg.gpu_data()[i]; + // loss accumulated accumulated by the triplet part + loss += std::max(margin + dist_sq_.gpu_data()[i], Dtype(0.0)); + } + } + loss = loss / static_cast(num_set) / Dtype(2); + top[0]->mutable_gpu_data()[0] = loss; + } else { + for (int i = 0; i < num_set; ++i) { + caffe_gpu_sub( + dim, + bottom[0]->gpu_data() + (2 + num_triplets)*i*dim, // reference + bottom[0]->gpu_data() + ((2 + num_triplets)*i + 1)*dim, // positive + diff_pos.mutable_gpu_data() + i*dim); // reference-pose_close + // Loss component calculated from reference and close one + caffe_gpu_dot( + dim, + diff_pos.gpu_data() + i*dim, + diff_pos.gpu_data() + i*dim, + dist_sq_pos.mutable_cpu_data() + i); + // a b is a similar pair for pair wise + // loss accumulated by the pair wise part + loss += dist_sq_pos.gpu_data()[i]; + for (int triplet = 0; 
triplet < num_triplets; ++triplet) { + dist_sq_.mutable_gpu_data()[i] = dist_sq_pos.mutable_gpu_data()[i]; + dist_sq_.mutable_gpu_data()[i] += margin; + // Loss component calculated from negative part + caffe_gpu_sub( + dim, + bottom[0]->gpu_data() + (2 + num_triplets)*i*dim, // reference + bottom[0]->gpu_data() + ((2 + num_triplets)*i + 2 + triplet)*dim, + diff_neg.mutable_gpu_data() + i*dim); // reference-negative + caffe_gpu_dot( + dim, + diff_neg.gpu_data() + i*dim, + diff_neg.gpu_data() + i*dim, + dist_sq_neg.mutable_cpu_data() + i); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_gpu_data()[i] = 1 - \ + dist_sq_neg.gpu_data()[i] / dist_sq_.mutable_gpu_data()[i]; + // loss accumulated accumulated by the triplet part + loss += std::max(dist_sq_.gpu_data()[i], Dtype(0.0)); + } + } + loss = loss / static_cast(num_set) / Dtype(2); + top[0]->mutable_gpu_data()[0] = loss; + } +} + +template +void TripletLossLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + Dtype margin = this->layer_param_.triplet_loss_param().margin(); + Dtype losstype = this->layer_param_.triplet_loss_param().losstype(); + int num_triplets = this->layer_param_.triplet_loss_param().num_triplets(); + int dim = bottom[0]->count()/bottom[0]->num(); + int num_set = bottom[0]->num()/(2 + num_triplets); + if (losstype == 0) { + // BP for feat1(extracted from reference) + for (int i = 0; i < 1; ++i) { + if (propagate_down[0]) { + const Dtype sign = 1; + const Dtype alpha = sign * top[0]->gpu_diff()[0] / + static_cast(num_set); + for (int j = 0; j < num_set; ++j) { + Dtype* bout = bottom[0]->mutable_gpu_diff(); + // the pair part + caffe_gpu_axpby( + dim, + alpha, + diff_pos.gpu_data() + (j*dim), + Dtype(0.0), + bout + ((2 + num_triplets)*j + i)*dim); + // the num_triplets triplet part + for (int triplet = 0; triplet < num_triplets; ++triplet) { + caffe_gpu_sub( + dim, + bottom[0]->gpu_data() + (2 + num_triplets)*j*dim, // 
reference + bottom[0]->gpu_data() + ((2 + num_triplets)*j + 2 + triplet)*dim, + diff_neg.mutable_gpu_data() + j*dim); // reference-negative + caffe_gpu_dot( + dim, + diff_neg.gpu_data() + j*dim, + diff_neg.gpu_data() + j*dim, + dist_sq_neg.mutable_cpu_data() + j); + // Triplet loss accumulation + // a and negative[triplet] is a similar pair for triplet + dist_sq_.mutable_gpu_data()[j] = dist_sq_pos.gpu_data()[j]; + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_gpu_data()[j] -= dist_sq_neg.gpu_data()[j]; + // Loss component calculated from negative part + if ((margin + dist_sq_.gpu_data()[j]) > Dtype(0.0)) { + // similar pair in triplet + caffe_gpu_axpby( + dim, + alpha, + diff_pos.gpu_data() + (j*dim), + Dtype(1.0), + bout + (2 + num_triplets)*j*dim); + // dissimilar pair in triplet + caffe_gpu_axpby( + dim, + -alpha, + diff_neg.gpu_data() + (j*dim), + Dtype(1.0), + bout + ((2 + num_triplets)*j + i)*dim); + } + } + } + } + } + // BP for feat2(extracted from the closest sample) + for (int i = 1; i < 2; ++i) { + if (propagate_down[0]) { + const Dtype sign = -1; + const Dtype alpha = sign * top[0]->gpu_diff()[0] / + static_cast(num_set); + for (int j = 0; j < num_set; ++j) { + Dtype* bout = bottom[0]->mutable_gpu_diff(); + // the pair part + caffe_gpu_axpby( + dim, + alpha, + diff_pos.gpu_data() + (j*dim), + Dtype(0.0), + bout + ((2 + num_triplets)*j + i)*dim); + // the num_triplets triplet part + for (int triplet = 0; triplet < num_triplets; ++triplet) { + caffe_gpu_sub( + dim, + bottom[0]->gpu_data() + (2 + num_triplets)*j*dim, // reference + bottom[0]->gpu_data() + ((2 + num_triplets)*j + 2 + triplet)*dim, + diff_neg.mutable_gpu_data() + j*dim); // reference-negative + // Triplet loss accumulation + // a and negative[triplet] is a similar pair for triplet + dist_sq_.mutable_gpu_data()[j] = dist_sq_pos.gpu_data()[j]; + caffe_gpu_dot( + dim, + diff_neg.gpu_data() + j*dim, + diff_neg.gpu_data() + j*dim, + dist_sq_neg.mutable_cpu_data() 
+ j); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_gpu_data()[j] -= dist_sq_neg.gpu_data()[j]; + if ((margin + dist_sq_.gpu_data()[j]) > Dtype(0.0)) { + // similar pair in triplet + caffe_gpu_axpby( + dim, + alpha, + diff_pos.gpu_data() + (j*dim), + Dtype(1.0), + bout + ((2 + num_triplets)*j + i)*dim); + } + } + } + } + } + // BP for negative feature used in the num_triplets triplet part + for (int i = 2; i < 2 + num_triplets; ++i) { + if (propagate_down[0]) { + const Dtype sign = 1; + const Dtype alpha = sign * top[0]->gpu_diff()[0] / + static_cast(num_set); + for (int j = 0; j < num_set; ++j) { + Dtype* bout = bottom[0]->mutable_gpu_diff(); + caffe_gpu_sub( + dim, + bottom[0]->gpu_data() + (2 + num_triplets)*j*dim, // reference + bottom[0]->gpu_data() + ((2 + num_triplets)*j + i)*dim, + diff_neg.mutable_gpu_data() + j*dim); // reference-negative + // Triplet loss accumulation + // a and negative[triplet] is a similar pair for triplet + dist_sq_.mutable_gpu_data()[j] = dist_sq_pos.gpu_data()[j]; + caffe_gpu_dot( + dim, + diff_neg.gpu_data() + j*dim, + diff_neg.gpu_data() + j*dim, + dist_sq_neg.mutable_cpu_data() + j); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_gpu_data()[j] -= dist_sq_neg.gpu_data()[j]; + if ((margin + dist_sq_.gpu_data()[j]) > Dtype(0.0)) { + // dissimilar pairs + caffe_gpu_axpby( + dim, + alpha, + diff_neg.gpu_data() + (j*dim), + Dtype(0.0), + bout + ((2 + num_triplets)*j + i)*dim); + } else { + caffe_gpu_set(dim, Dtype(0), bout + ((2 + num_triplets)*j + i)*dim); + } + } + } + } + } else { + for (int i = 0; i < 1; ++i) { + // BP for data1(feat1) + if (propagate_down[0]) { + const Dtype sign = 1; + const Dtype alpha = sign * top[0]->gpu_diff()[0] / + static_cast(num_set); + for (int j = 0; j < num_set; ++j) { + Dtype* bout = bottom[0]->mutable_gpu_diff(); + // the pair part + caffe_gpu_axpby( + dim, + alpha, + diff_pos.gpu_data() + (j*dim), + Dtype(0.0), + bout + ((2 + 
num_triplets)*j + i)*dim); + // the num_triplets triplet part + for (int triplet = 0; triplet < num_triplets; ++triplet) { + dist_sq_.mutable_gpu_data()[j] = dist_sq_pos.mutable_gpu_data()[j]; + dist_sq_.mutable_gpu_data()[j] += margin; + // Loss component calculated from negative part + caffe_gpu_sub( + dim, + bottom[0]->gpu_data() + (2 + num_triplets)*j*dim, // reference + bottom[0]->gpu_data() + ((2 + num_triplets)*j + 2 + triplet)*dim, + diff_neg.mutable_gpu_data() + j*dim); // reference-negative + caffe_gpu_dot( + dim, + diff_neg.gpu_data() + j*dim, + diff_neg.gpu_data() + j*dim, + dist_sq_neg.mutable_cpu_data() + j); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_gpu_data()[j] = 1 - \ + dist_sq_neg.gpu_data()[j] / dist_sq_.mutable_gpu_data()[j]; + // loss accumulated accumulated by the triplet part + if ((dist_sq_.gpu_data()[j]) > Dtype(0.0)) { + caffe_gpu_axpby( + dim, + alpha*dist_sq_neg.mutable_gpu_data()[j]\ + /((dist_sq_pos.mutable_gpu_data()[j]+margin)\ + *(dist_sq_pos.mutable_gpu_data()[j]+margin)), + diff_pos.gpu_data() + (j*dim), + Dtype(1.0), + bout + ((2 + num_triplets)*j + i)*dim); + caffe_gpu_axpby( + dim, + -alpha/(dist_sq_pos.mutable_gpu_data()[j] + margin), + diff_neg.gpu_data() + (j*dim), + Dtype(1.0), + bout + ((2 + num_triplets)*j + i)*dim); + } + } + } + } + } + for (int i = 1; i < 2; ++i) { + // BP for positive data(feat2) + if (propagate_down[0]) { + const Dtype sign = -1; + const Dtype alpha = sign * top[0]->gpu_diff()[0] / + static_cast(num_set); + for (int j = 0; j < num_set; ++j) { + Dtype* bout = bottom[0]->mutable_gpu_diff(); + // the pair part + caffe_gpu_axpby( + dim, + alpha, + diff_pos.gpu_data() + (j*dim), + Dtype(0.0), + bout + ((2 + num_triplets)*j + i)*dim); + // the num_triplets triplet part + for (int triplet = 0; triplet < num_triplets; ++triplet) { + dist_sq_.mutable_gpu_data()[j] = dist_sq_pos.mutable_gpu_data()[j]; + dist_sq_.mutable_gpu_data()[j] += margin; + // Loss component 
calculated from negative part + caffe_gpu_sub( + dim, + bottom[0]->gpu_data() + (2 + num_triplets)*j*dim, // reference + bottom[0]->gpu_data() + ((2 + num_triplets)*j + 2 + triplet)*dim, + diff_neg.mutable_gpu_data() + j*dim); // reference-negative + caffe_gpu_dot( + dim, + diff_neg.gpu_data() + j*dim, + diff_neg.gpu_data() + j*dim, + dist_sq_neg.mutable_cpu_data() + j); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_gpu_data()[j] = 1 - \ + dist_sq_neg.gpu_data()[j] / dist_sq_.mutable_gpu_data()[j]; + // loss accumulated accumulated by the triplet part + if ((dist_sq_.gpu_data()[j]) > Dtype(0.0)) { + caffe_gpu_axpby( + dim, + alpha*dist_sq_neg.mutable_gpu_data()[j]\ + /((dist_sq_pos.mutable_gpu_data()[j]+margin)\ + *(dist_sq_pos.mutable_gpu_data()[j]+margin)), + diff_pos.gpu_data() + (j*dim), + Dtype(1.0), + bout + ((2 + num_triplets)*j + i)*dim); + } + } + } + } + } + for (int i = 2; i < 2 + num_triplets; ++i) { + // BP for negative data(feat3) + if (propagate_down[0]) { + const Dtype sign = 1; + const Dtype alpha = sign * top[0]->gpu_diff()[0] / + static_cast(num_set); + for (int j = 0; j < num_set; ++j) { + Dtype* bout = bottom[0]->mutable_gpu_diff(); + dist_sq_.mutable_gpu_data()[j] = dist_sq_pos.mutable_gpu_data()[j]; + dist_sq_.mutable_gpu_data()[j] += margin; + // Loss component calculated from negative part + caffe_gpu_sub( + dim, + bottom[0]->gpu_data() + (2 + num_triplets)*j*dim, // reference + bottom[0]->gpu_data() + ((2 + num_triplets)*j + i)*dim, + diff_neg.mutable_gpu_data() + j*dim); // reference-negative + caffe_gpu_dot( + dim, + diff_neg.gpu_data() + j*dim, + diff_neg.gpu_data() + j*dim, + dist_sq_neg.mutable_cpu_data() + j); + // a and negative[triplet] is a dissimilar pair for triplet + dist_sq_.mutable_gpu_data()[j] = 1 - \ + dist_sq_neg.gpu_data()[j] / dist_sq_.mutable_gpu_data()[j]; + // loss accumulated accumulated by the triplet part + if ((dist_sq_.gpu_data()[j]) > Dtype(0.0)) { + caffe_gpu_axpby( + dim, + 
alpha/(dist_sq_pos.mutable_gpu_data()[j] + margin), + diff_neg.gpu_data() + (j*dim), + Dtype(0.0), + bout + ((2 + num_triplets)*j + i)*dim); + } else { + caffe_gpu_set(dim, Dtype(0), bout + ((2 + num_triplets)*j + i)*dim); + } + } + } + } + } +} + +INSTANTIATE_LAYER_GPU_FUNCS(TripletLossLayer); + +} // namespace caffe diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index f8db61c9258..f637f2ec6d4 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -1,4 +1,3 @@ -#ifdef USE_OPENCV #include #include @@ -469,4 +468,3 @@ INSTANTIATE_CLASS(WindowDataLayer); REGISTER_LAYER_CLASS(WindowData); } // namespace caffe -#endif // USE_OPENCV diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index ebb8b5d28c2..89d14013dc9 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -810,11 +810,12 @@ void Net::Backward() { BackwardFromTo(layers_.size() - 1, 0); if (debug_info_) { Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0; - for (int i = 0; i < learnable_params_.size(); ++i) { - asum_data += learnable_params_[i]->asum_data(); - asum_diff += learnable_params_[i]->asum_diff(); - sumsq_data += learnable_params_[i]->sumsq_data(); - sumsq_diff += learnable_params_[i]->sumsq_diff(); + for (int i = 0; i < params_.size(); ++i) { + if (param_owners_[i] >= 0) { continue; } + asum_data += params_[i]->asum_data(); + asum_diff += params_[i]->asum_diff(); + sumsq_data += params_[i]->sumsq_data(); + sumsq_diff += params_[i]->sumsq_diff(); } const Dtype l2norm_data = std::sqrt(sumsq_data); const Dtype l2norm_diff = std::sqrt(sumsq_diff); diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index f52c941b05e..331d31ce8f8 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -385,6 +385,7 @@ message LayerParameter { optional ThresholdParameter threshold_param = 128; optional TileParameter tile_param = 138; optional WindowDataParameter 
window_data_param = 129; + optional TripletLossParameter triplet_loss_param = 139; } // Message that stores parameters used to apply transformation @@ -468,27 +469,28 @@ message ContrastiveLossParameter { optional bool legacy_version = 2 [default = false]; } +message TripletLossParameter { + //margin for negative triplet + optional float margin = 1 [default = 1.0]; + optional uint32 losstype = 2 [default = 1]; + optional uint32 num_triplets = 3 [default = 3]; +} + message ConvolutionParameter { optional uint32 num_output = 1; // The number of outputs for the layer optional bool bias_term = 2 [default = true]; // whether to have bias terms - // Pad, kernel size, and stride are all given as a single value for equal - // dimensions in all spatial dimensions, or once per spatial dimension. - repeated uint32 pad = 3; // The padding size; defaults to 0 - repeated uint32 kernel_size = 4; // The kernel size - repeated uint32 stride = 6; // The stride; defaults to 1 - - // For 2D convolution only, the *_h and *_w versions may also be used to - // specify both spatial dimensions. - optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only) - optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only) - optional uint32 kernel_h = 11; // The kernel height (2D only) - optional uint32 kernel_w = 12; // The kernel width (2D only) - optional uint32 stride_h = 13; // The stride height (2D only) - optional uint32 stride_w = 14; // The stride width (2D only) - + // dimensions in height and width or as Y, X pairs. 
+ optional uint32 pad = 3 [default = 0]; // The padding size (equal in Y, X) + optional uint32 pad_h = 9 [default = 0]; // The padding height + optional uint32 pad_w = 10 [default = 0]; // The padding width + optional uint32 kernel_size = 4; // The kernel size (square) + optional uint32 kernel_h = 11; // The kernel height + optional uint32 kernel_w = 12; // The kernel width optional uint32 group = 5 [default = 1]; // The group size for group conv - + optional uint32 stride = 6 [default = 1]; // The stride (equal in Y, X) + optional uint32 stride_h = 13; // The stride height + optional uint32 stride_w = 14; // The stride width optional FillerParameter weight_filler = 7; // The filler for the weight optional FillerParameter bias_filler = 8; // The filler for the bias enum Engine { @@ -497,24 +499,6 @@ message ConvolutionParameter { CUDNN = 2; } optional Engine engine = 15 [default = DEFAULT]; - - // The axis to interpret as "channels" when performing convolution. - // Preceding dimensions are treated as independent inputs; - // succeeding dimensions are treated as "spatial". - // With (N, C, H, W) inputs, and axis == 1 (the default), we perform - // N independent 2D convolutions, sliding C-channel (or (C/g)-channels, for - // groups g>1) filters across the spatial axes (H, W) of the input. - // With (N, C, D, H, W) inputs, and axis == 1, we perform - // N independent 3D convolutions, sliding (C/g)-channels - // filters across the spatial axes (D, H, W) of the input. - optional int32 axis = 16 [default = 1]; - - // Whether to force use of the general ND convolution, even if a specific - // implementation for blobs of the appropriate number of spatial dimensions - // is available. (Currently, there is only a 2D-specific convolution - // implementation; for input blobs with num_axes != 2, this option is - // ignored and the ND implementation will be used.) 
- optional bool force_nd_im2col = 17 [default = false]; } message DataParameter { @@ -1053,8 +1037,10 @@ message V1LayerParameter { SPLIT = 22; SLICE = 33; TANH = 23; + TRIPLET_LOSS = 40; WINDOW_DATA = 24; THRESHOLD = 31; + } optional LayerType type = 5; repeated BlobProto blobs = 6; @@ -1098,6 +1084,7 @@ message V1LayerParameter { optional TransformationParameter transform_param = 36; optional LossParameter loss_param = 42; optional V0LayerParameter layer = 1; + optional TripletLossParameter triplet_loss_param = 43; } // DEPRECATED: V0LayerParameter is the old way of specifying layer parameters diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 12c13dd8385..394ec3b3ad7 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -55,7 +55,6 @@ void Solver::Init(const SolverParameter& param) { << std::endl << param.DebugString(); param_ = param; CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; - CheckSnapshotWritePermissions(); if (Caffe::root_solver() && param_.random_seed() >= 0) { Caffe::set_random_seed(param_.random_seed()); } @@ -422,37 +421,19 @@ void Solver::Snapshot() { CHECK(Caffe::root_solver()); string model_filename; switch (param_.snapshot_format()) { - case caffe::SolverParameter_SnapshotFormat_BINARYPROTO: - model_filename = SnapshotToBinaryProto(); - break; - case caffe::SolverParameter_SnapshotFormat_HDF5: - model_filename = SnapshotToHDF5(); - break; - default: - LOG(FATAL) << "Unsupported snapshot format."; + case caffe::SolverParameter_SnapshotFormat_BINARYPROTO: + model_filename = SnapshotToBinaryProto(); + break; + case caffe::SolverParameter_SnapshotFormat_HDF5: + model_filename = SnapshotToHDF5(); + break; + default: + LOG(FATAL) << "Unsupported snapshot format."; } SnapshotSolverState(model_filename); } -template -void Solver::CheckSnapshotWritePermissions() { - if (Caffe::root_solver() && param_.snapshot()) { - CHECK(param_.has_snapshot_prefix()) - << "In solver params, snapshot is specified but 
snapshot_prefix is not"; - string probe_filename = SnapshotFilename(".tempfile"); - std::ofstream probe_ofs(probe_filename.c_str()); - if (probe_ofs.good()) { - probe_ofs.close(); - std::remove(probe_filename.c_str()); - } else { - LOG(FATAL) << "Cannot write to snapshot prefix '" - << param_.snapshot_prefix() << "'. Make sure " - << "that the directory exists and is writeable."; - } - } -} - template string Solver::SnapshotFilename(const string extension) { string filename(param_.snapshot_prefix()); @@ -751,7 +732,7 @@ void SGDSolver::SnapshotSolverStateToBinaryProto( } string snapshot_filename = Solver::SnapshotFilename(".solverstate"); LOG(INFO) - << "Snapshotting solver state to binary proto file " << snapshot_filename; + << "Snapshotting solver state to binary proto file" << snapshot_filename; WriteProtoToBinaryFile(state, snapshot_filename.c_str()); } diff --git a/src/caffe/test/test_accuracy_layer.cpp b/src/caffe/test/test_accuracy_layer.cpp index ef0e57a37a1..94e529b5eee 100644 --- a/src/caffe/test/test_accuracy_layer.cpp +++ b/src/caffe/test/test_accuracy_layer.cpp @@ -250,6 +250,7 @@ TYPED_TEST(AccuracyLayerTest, TestForwardCPUTopK) { TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClass) { LayerParameter layer_param; + Caffe::set_mode(Caffe::CPU); AccuracyLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_); layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_); @@ -278,16 +279,16 @@ TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClass) { EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), num_correct_labels / 100.0, 1e-4); for (int i = 0; i < num_class; ++i) { - TypeParam accuracy_per_class = (num_per_class[i] > 0 ? 
- static_cast(correct_per_class[i]) / num_per_class[i] : 0); EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0), - accuracy_per_class, 1e-4); + static_cast(correct_per_class[i]) / num_per_class[i], + 1e-4); } } TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClassWithIgnoreLabel) { LayerParameter layer_param; + Caffe::set_mode(Caffe::CPU); const TypeParam kIgnoreLabelValue = -1; layer_param.mutable_accuracy_param()->set_ignore_label(kIgnoreLabelValue); AccuracyLayer layer(layer_param); @@ -328,10 +329,9 @@ TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClassWithIgnoreLabel) { EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), num_correct_labels / TypeParam(count), 1e-4); for (int i = 0; i < 10; ++i) { - TypeParam accuracy_per_class = (num_per_class[i] > 0 ? - static_cast(correct_per_class[i]) / num_per_class[i] : 0); EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0), - accuracy_per_class, 1e-4); + TypeParam(correct_per_class[i]) / num_per_class[i], + 1e-4); } } diff --git a/src/caffe/test/test_concat_layer.cpp b/src/caffe/test/test_concat_layer.cpp index ccd97eb1d66..088e0a41685 100644 --- a/src/caffe/test/test_concat_layer.cpp +++ b/src/caffe/test/test_concat_layer.cpp @@ -99,19 +99,6 @@ TYPED_TEST(ConcatLayerTest, TestSetupChannelsNegativeIndexing) { EXPECT_EQ(this->blob_top_->width(), this->blob_bottom_0_->width()); } -TYPED_TEST(ConcatLayerTest, TestForwardTrivial) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConcatLayer layer(layer_param); - this->blob_bottom_vec_0_.resize(1); - layer.SetUp(this->blob_bottom_vec_0_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_0_, this->blob_top_vec_); - for (int i = 0; i < this->blob_bottom_0_->count(); ++i) { - EXPECT_EQ(this->blob_bottom_0_->cpu_data()[i], - this->blob_top_->cpu_data()[i]); - } -} - TYPED_TEST(ConcatLayerTest, TestForwardNum) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; @@ -167,16 +154,6 @@ TYPED_TEST(ConcatLayerTest, 
TestForwardChannels) { } } -TYPED_TEST(ConcatLayerTest, TestGradientTrivial) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConcatLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - this->blob_bottom_vec_0_.resize(1); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_0_, - this->blob_top_vec_); -} - TYPED_TEST(ConcatLayerTest, TestGradientNum) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp index 9df979a2d27..67d41fff844 100644 --- a/src/caffe/test/test_convolution_layer.cpp +++ b/src/caffe/test/test_convolution_layer.cpp @@ -19,87 +19,54 @@ template void caffe_conv(const Blob* in, ConvolutionParameter* conv_param, const vector > >& weights, Blob* out) { - const bool has_depth = (out->num_axes() == 5); - if (!has_depth) { CHECK_EQ(4, out->num_axes()); } // Kernel size, stride, and pad int kernel_h, kernel_w; - if (conv_param->has_kernel_h() || conv_param->has_kernel_w()) { + if (conv_param->has_kernel_size()) { + kernel_h = kernel_w = conv_param->kernel_size(); + } else { kernel_h = conv_param->kernel_h(); kernel_w = conv_param->kernel_w(); - } else { - kernel_h = kernel_w = conv_param->kernel_size(0); } int pad_h, pad_w; - if (conv_param->has_pad_h() || conv_param->has_pad_w()) { + if (!conv_param->has_pad_h()) { + pad_h = pad_w = conv_param->pad(); + } else { pad_h = conv_param->pad_h(); pad_w = conv_param->pad_w(); - } else { - pad_h = pad_w = conv_param->pad_size() ? conv_param->pad(0) : 0; } int stride_h, stride_w; - if (conv_param->has_stride_h() || conv_param->has_stride_w()) { + if (!conv_param->has_stride_h()) { + stride_h = stride_w = conv_param->stride(); + } else { stride_h = conv_param->stride_h(); stride_w = conv_param->stride_w(); - } else { - stride_h = stride_w = conv_param->stride_size() ? 
conv_param->stride(0) : 1; - } - int kernel_d, pad_d, stride_d; - if (has_depth) { - kernel_d = kernel_h; - stride_d = stride_h; - pad_d = pad_h; - } else { - kernel_d = stride_d = 1; - pad_d = 0; } // Groups int groups = conv_param->group(); - int o_g = out->shape(1) / groups; - int k_g = in->shape(1) / groups; + int o_g = out->channels() / groups; + int k_g = in->channels() / groups; int o_head, k_head; // Convolution - vector weight_offset(4 + has_depth); - vector in_offset(4 + has_depth); - vector out_offset(4 + has_depth); + const Dtype* in_data = in->cpu_data(); + const Dtype* weight_data = weights[0]->cpu_data(); Dtype* out_data = out->mutable_cpu_data(); - for (int n = 0; n < out->shape(0); n++) { + for (int n = 0; n < out->num(); n++) { for (int g = 0; g < groups; g++) { o_head = o_g * g; k_head = k_g * g; for (int o = 0; o < o_g; o++) { for (int k = 0; k < k_g; k++) { - for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) { - for (int y = 0; y < out->shape(2 + has_depth); y++) { - for (int x = 0; x < out->shape(3 + has_depth); x++) { - for (int r = 0; r < kernel_d; r++) { - for (int p = 0; p < kernel_h; p++) { - for (int q = 0; q < kernel_w; q++) { - int in_z = z * stride_d - pad_d + r; - int in_y = y * stride_h - pad_h + p; - int in_x = x * stride_w - pad_w + q; - if (in_z >= 0 && in_z < (has_depth ? 
in->shape(2) : 1) - && in_y >= 0 && in_y < in->shape(2 + has_depth) - && in_x >= 0 && in_x < in->shape(3 + has_depth)) { - weight_offset[0] = o + o_head; - weight_offset[1] = k; - if (has_depth) { weight_offset[2] = r; } - weight_offset[2 + has_depth] = p; - weight_offset[3 + has_depth] = q; - in_offset[0] = n; - in_offset[1] = k + k_head; - if (has_depth) { in_offset[2] = in_z; } - in_offset[2 + has_depth] = in_y; - in_offset[3 + has_depth] = in_x; - out_offset[0] = n; - out_offset[1] = o + o_head; - if (has_depth) { out_offset[2] = z; } - out_offset[2 + has_depth] = y; - out_offset[3 + has_depth] = x; - out_data[out->offset(out_offset)] += - in->data_at(in_offset) - * weights[0]->data_at(weight_offset); - } - } + for (int y = 0; y < out->height(); y++) { + for (int x = 0; x < out->width(); x++) { + for (int p = 0; p < kernel_h; p++) { + for (int q = 0; q < kernel_w; q++) { + int in_y = y * stride_h - pad_h + p; + int in_x = x * stride_w - pad_w + q; + if (in_y >= 0 && in_y < in->height() + && in_x >= 0 && in_x < in->width()) { + out_data[out->offset(n, o + o_head, y, x)] += + in_data[in->offset(n, k + k_head, in_y, in_x)] + * weight_data[weights[0]->offset(o + o_head, k, p, q)]; } } } @@ -112,18 +79,11 @@ void caffe_conv(const Blob* in, ConvolutionParameter* conv_param, // Bias if (conv_param->bias_term()) { const Dtype* bias_data = weights[1]->cpu_data(); - for (int n = 0; n < out->shape(0); n++) { - for (int o = 0; o < out->shape(1); o++) { - for (int z = 0; z < (has_depth ? 
out->shape(2) : 1); z++) { - for (int y = 0; y < out->shape(2 + has_depth); y++) { - for (int x = 0; x < out->shape(3 + has_depth); x++) { - out_offset[0] = n; - out_offset[1] = o; - if (has_depth) { out_offset[2] = z; } - out_offset[2 + has_depth] = y; - out_offset[3 + has_depth] = x; - out_data[out->offset(out_offset)] += bias_data[o]; - } + for (int n = 0; n < out->num(); n++) { + for (int o = 0; o < out->channels(); o++) { + for (int y = 0; y < out->height(); y++) { + for (int x = 0; x < out->width(); x++) { + out_data[out->offset(n, o, y, x)] += bias_data[o]; } } } @@ -190,8 +150,8 @@ TYPED_TEST(ConvolutionLayerTest, TestSetup) { LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); convolution_param->set_num_output(4); this->blob_bottom_vec_.push_back(this->blob_bottom_2_); this->blob_top_vec_.push_back(this->blob_top_2_); @@ -228,8 +188,8 @@ TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolution) { LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); convolution_param->set_num_output(4); convolution_param->mutable_weight_filler()->set_type("gaussian"); convolution_param->mutable_bias_filler()->set_type("constant"); @@ -257,98 +217,13 @@ TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolution) { } } -TYPED_TEST(ConvolutionLayerTest, Test0DConvolution) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - const int kNumOutput = 3; - convolution_param->set_num_output(kNumOutput); - convolution_param->set_axis(3); - 
convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - shared_ptr > layer( - new ConvolutionLayer(layer_param)); - vector top_shape = this->blob_bottom_->shape(); - top_shape[3] = kNumOutput; - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(top_shape, this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Check against reference convolution. - vector weight_offset(2); - const Blob* weight = layer->blobs()[0].get(); - const Blob* bias = layer->blobs()[1].get(); - const int num = this->blob_top_->count(3); - const int dim = this->blob_top_->shape(3); - const int bottom_dim = this->blob_bottom_->shape(3); - for (int n = 0; n < num; ++n) { - for (int d = 0; d < dim; ++d) { - weight_offset[0] = d; - Dtype value = bias->cpu_data()[d]; - for (int bottom_d = 0; bottom_d < bottom_dim; ++bottom_d) { - weight_offset[1] = bottom_d; - value += weight->data_at(weight_offset) * - this->blob_bottom_->cpu_data()[n * bottom_dim + bottom_d]; - } - EXPECT_NEAR(value, this->blob_top_->cpu_data()[n * dim + d], 1e-4); - } - } -} - -TYPED_TEST(ConvolutionLayerTest, TestSimple3DConvolution) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - vector bottom_shape(5); - bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0); - bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1); - bottom_shape[2] = 5; - bottom_shape[3] = this->blob_bottom_vec_[0]->shape(2); - bottom_shape[4] = this->blob_bottom_vec_[0]->shape(3); - FillerParameter filler_param; - GaussianFiller filler(filler_param); - for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { - this->blob_bottom_vec_[i]->Reshape(bottom_shape); - filler.Fill(this->blob_bottom_vec_[i]); - } - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - 
layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(4); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - shared_ptr > layer( - new ConvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Check against reference convolution. - const Dtype* top_data; - const Dtype* ref_top_data; - caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_)); - top_data = this->blob_top_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } - caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_2_)); - top_data = this->blob_top_2_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } -} - TYPED_TEST(ConvolutionLayerTest, Test1x1Convolution) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(1); - convolution_param->add_stride(1); + convolution_param->set_kernel_size(1); + convolution_param->set_stride(1); convolution_param->set_num_output(4); convolution_param->mutable_weight_filler()->set_type("gaussian"); convolution_param->mutable_bias_filler()->set_type("constant"); @@ -374,8 +249,8 @@ TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolutionGroup) { LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - 
convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); convolution_param->set_num_output(3); convolution_param->set_group(3); convolution_param->mutable_weight_filler()->set_type("gaussian"); @@ -413,8 +288,8 @@ TYPED_TEST(ConvolutionLayerTest, TestSobelConvolution) { LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); convolution_param->set_num_output(1); convolution_param->set_bias_term(false); shared_ptr > layer( @@ -475,11 +350,14 @@ TYPED_TEST(ConvolutionLayerTest, TestSobelConvolution) { convolution_param->set_bias_term(false); layer.reset(new ConvolutionLayer(layer_param)); layer->blobs().resize(1); - layer->blobs()[0].reset(new Blob(1, 1, 1, 3)); + layer->blobs()[0].reset(new Blob(1, 3, 1, 3)); Dtype* weights_2 = layer->blobs()[0]->mutable_cpu_data(); - weights_2[0] = -1; - weights_2[1] = 0; - weights_2[2] = 1; + for (int c = 0; c < 3; ++c) { + int i = c * 3; // 1 x 3 filter + weights_2[i + 0] = -1; + weights_2[i + 1] = 0; + weights_2[i + 2] = 1; + } layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec); layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec); // Test equivalence of full and separable filters. 
@@ -490,124 +368,6 @@ TYPED_TEST(ConvolutionLayerTest, TestSobelConvolution) { } } -TYPED_TEST(ConvolutionLayerTest, TestNDAgainst2D) { - typedef typename TypeParam::Dtype Dtype; - const int kernel_h = 11; - const int kernel_w = 13; - vector bottom_shape(4); - bottom_shape[0] = 15; - bottom_shape[1] = 18; - bottom_shape[2] = kernel_h * 2; - bottom_shape[3] = kernel_w * 2; - FillerParameter filler_param; - GaussianFiller filler(filler_param); - for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { - this->blob_bottom_vec_[i]->Reshape(bottom_shape); - filler.Fill(this->blob_bottom_vec_[i]); - } - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->set_num_output(12); - convolution_param->set_bias_term(false); - convolution_param->set_group(6); - convolution_param->set_kernel_h(kernel_h); - convolution_param->set_kernel_w(kernel_w); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - Blob weights; - Blob top_diff; - // Shape and fill weights and top_diff. - bool copy_diff; - bool reshape; - { - ConvolutionLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - top_diff.ReshapeLike(*this->blob_top_); - filler.Fill(&top_diff); - ASSERT_EQ(1, layer.blobs().size()); - copy_diff = false; reshape = true; - weights.CopyFrom(*layer.blobs()[0], copy_diff, reshape); - } - vector propagate_down(1, true); - Blob result_2d; - Blob backward_result_2d; - Blob backward_weight_result_2d; - // Test with 2D im2col - { - caffe_set(this->blob_top_->count(), Dtype(0), - this->blob_top_->mutable_cpu_data()); - caffe_set(this->blob_bottom_->count(), Dtype(0), - this->blob_bottom_->mutable_cpu_diff()); - caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff()); - // Do SetUp and Forward; save Forward result in result_2d. 
- convolution_param->set_force_nd_im2col(false); - ConvolutionLayer layer_2d(layer_param); - layer_2d.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(1, layer_2d.blobs().size()); - copy_diff = false; reshape = false; - layer_2d.blobs()[0]->CopyFrom(weights, copy_diff, reshape); - layer_2d.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - copy_diff = false; reshape = true; - result_2d.CopyFrom(*this->blob_top_, copy_diff, reshape); - // Copy pre-generated top diff into actual top diff; - // do Backward and save result in backward_result_2d. - ASSERT_EQ(this->blob_top_->shape(), top_diff.shape()); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_top_->mutable_cpu_diff()); - layer_2d.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - copy_diff = true; reshape = true; - backward_result_2d.CopyFrom(*this->blob_bottom_, copy_diff, reshape); - backward_weight_result_2d.CopyFrom(weights, copy_diff, reshape); - } - Blob result_nd; - Blob backward_result_nd; - Blob backward_weight_result_nd; - // Test with ND im2col - { - caffe_set(this->blob_top_->count(), Dtype(0), - this->blob_top_->mutable_cpu_data()); - caffe_set(this->blob_bottom_->count(), Dtype(0), - this->blob_bottom_->mutable_cpu_diff()); - caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff()); - // Do SetUp and Forward; save Forward result in result_nd. - convolution_param->set_force_nd_im2col(true); - ConvolutionLayer layer_nd(layer_param); - layer_nd.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(1, layer_nd.blobs().size()); - copy_diff = false; reshape = false; - layer_nd.blobs()[0]->CopyFrom(weights, copy_diff, reshape); - layer_nd.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - copy_diff = false; reshape = true; - result_nd.CopyFrom(*this->blob_top_, copy_diff, reshape); - // Copy pre-generated top diff into actual top diff; - // do Backward and save result in backward_result_nd. 
- ASSERT_EQ(this->blob_top_->shape(), top_diff.shape()); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_top_->mutable_cpu_diff()); - layer_nd.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - copy_diff = true; reshape = true; - backward_result_nd.CopyFrom(*this->blob_bottom_, copy_diff, reshape); - backward_weight_result_nd.CopyFrom(weights, copy_diff, reshape); - } - ASSERT_EQ(result_nd.count(), result_2d.count()); - for (int i = 0; i < result_2d.count(); ++i) { - EXPECT_EQ(result_2d.cpu_data()[i], result_nd.cpu_data()[i]); - } - ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count()); - for (int i = 0; i < backward_result_2d.count(); ++i) { - EXPECT_EQ(backward_result_2d.cpu_diff()[i], - backward_result_nd.cpu_diff()[i]); - } - ASSERT_EQ(backward_weight_result_nd.count(), - backward_weight_result_2d.count()); - for (int i = 0; i < backward_weight_result_2d.count(); ++i) { - EXPECT_EQ(backward_weight_result_2d.cpu_diff()[i], - backward_weight_result_nd.cpu_diff()[i]); - } -} - TYPED_TEST(ConvolutionLayerTest, TestGradient) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; @@ -615,36 +375,8 @@ TYPED_TEST(ConvolutionLayerTest, TestGradient) { layer_param.mutable_convolution_param(); this->blob_bottom_vec_.push_back(this->blob_bottom_2_); this->blob_top_vec_.push_back(this->blob_top_2_); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(2); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - ConvolutionLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(ConvolutionLayerTest, TestGradient3D) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - 
layer_param.mutable_convolution_param(); - vector bottom_shape(5); - bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0); - bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1); - bottom_shape[2] = 5; - bottom_shape[3] = this->blob_bottom_vec_[0]->shape(2); - bottom_shape[4] = this->blob_bottom_vec_[0]->shape(3); - FillerParameter filler_param; - GaussianFiller filler(filler_param); - for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { - this->blob_bottom_vec_[i]->Reshape(bottom_shape); - filler.Fill(this->blob_bottom_vec_[i]); - } - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); convolution_param->set_num_output(2); convolution_param->mutable_weight_filler()->set_type("gaussian"); convolution_param->mutable_bias_filler()->set_type("gaussian"); @@ -661,8 +393,8 @@ TYPED_TEST(ConvolutionLayerTest, Test1x1Gradient) { layer_param.mutable_convolution_param(); this->blob_bottom_vec_.push_back(this->blob_bottom_2_); this->blob_top_vec_.push_back(this->blob_top_2_); - convolution_param->add_kernel_size(1); - convolution_param->add_stride(1); + convolution_param->set_kernel_size(1); + convolution_param->set_stride(1); convolution_param->set_num_output(2); convolution_param->mutable_weight_filler()->set_type("gaussian"); convolution_param->mutable_bias_filler()->set_type("gaussian"); @@ -677,8 +409,8 @@ TYPED_TEST(ConvolutionLayerTest, TestGradientGroup) { LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); convolution_param->set_num_output(3); convolution_param->set_group(3); convolution_param->mutable_weight_filler()->set_type("gaussian"); @@ -740,8 +472,8 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSetupCuDNN) { LayerParameter layer_param; 
ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); convolution_param->set_num_output(4); this->blob_bottom_vec_.push_back(this->blob_bottom_2_); this->blob_top_vec_.push_back(this->blob_top_2_); @@ -777,8 +509,8 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSimpleConvolutionCuDNN) { LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); convolution_param->set_num_output(4); convolution_param->mutable_weight_filler()->set_type("gaussian"); convolution_param->mutable_bias_filler()->set_type("constant"); @@ -810,8 +542,8 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSimpleConvolutionGroupCuDNN) { LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); convolution_param->set_num_output(3); convolution_param->set_group(3); convolution_param->mutable_weight_filler()->set_type("gaussian"); @@ -849,8 +581,8 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSobelConvolutionCuDNN) { LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); convolution_param->set_num_output(1); convolution_param->set_bias_term(false); shared_ptr > layer( @@ -911,11 +643,14 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSobelConvolutionCuDNN) { convolution_param->set_bias_term(false); layer.reset(new 
CuDNNConvolutionLayer(layer_param)); layer->blobs().resize(1); - layer->blobs()[0].reset(new Blob(1, 1, 1, 3)); + layer->blobs()[0].reset(new Blob(1, 3, 1, 3)); TypeParam* weights_2 = layer->blobs()[0]->mutable_cpu_data(); - weights_2[0] = -1; - weights_2[1] = 0; - weights_2[2] = 1; + for (int c = 0; c < 3; ++c) { + int i = c * 3; // 1 x 3 filter + weights_2[i + 0] = -1; + weights_2[i + 1] = 0; + weights_2[i + 2] = 1; + } layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec); layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec); // Test equivalence of full and separable filters. @@ -932,8 +667,8 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestGradientCuDNN) { layer_param.mutable_convolution_param(); this->blob_bottom_vec_.push_back(this->blob_bottom_2_); this->blob_top_vec_.push_back(this->blob_top_2_); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); convolution_param->set_num_output(2); convolution_param->mutable_weight_filler()->set_type("gaussian"); convolution_param->mutable_bias_filler()->set_type("gaussian"); @@ -947,8 +682,8 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestGradientGroupCuDNN) { LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); convolution_param->set_num_output(3); convolution_param->set_group(3); convolution_param->mutable_weight_filler()->set_type("gaussian"); diff --git a/src/caffe/test/test_data_layer.cpp b/src/caffe/test/test_data_layer.cpp index 9e03954a543..afe2a40d227 100644 --- a/src/caffe/test/test_data_layer.cpp +++ b/src/caffe/test/test_data_layer.cpp @@ -1,4 +1,3 @@ -#ifdef USE_OPENCV #include #include @@ -349,7 +348,6 @@ class DataLayerTest : public MultiDeviceTest { TYPED_TEST_CASE(DataLayerTest, TestDtypesAndDevices); 
-#ifdef USE_LEVELDB TYPED_TEST(DataLayerTest, TestReadLevelDB) { const bool unique_pixels = false; // all pixels the same; images different this->Fill(unique_pixels, DataParameter_DB_LEVELDB); @@ -387,9 +385,7 @@ TYPED_TEST(DataLayerTest, TestReadCropTestLevelDB) { this->Fill(unique_pixels, DataParameter_DB_LEVELDB); this->TestReadCrop(TEST); } -#endif // USE_LEVELDB -#ifdef USE_LMDB TYPED_TEST(DataLayerTest, TestReadLMDB) { const bool unique_pixels = false; // all pixels the same; images different this->Fill(unique_pixels, DataParameter_DB_LMDB); @@ -428,6 +424,4 @@ TYPED_TEST(DataLayerTest, TestReadCropTestLMDB) { this->TestReadCrop(TEST); } -#endif // USE_LMDB } // namespace caffe -#endif // USE_OPENCV diff --git a/src/caffe/test/test_data_transformer.cpp b/src/caffe/test/test_data_transformer.cpp index 8a1013744e8..16570e20356 100644 --- a/src/caffe/test/test_data_transformer.cpp +++ b/src/caffe/test/test_data_transformer.cpp @@ -1,4 +1,3 @@ -#ifdef USE_OPENCV #include #include @@ -354,4 +353,3 @@ TYPED_TEST(DataTransformTest, TestMeanFile) { } } // namespace caffe -#endif // USE_OPENCV diff --git a/src/caffe/test/test_db.cpp b/src/caffe/test/test_db.cpp index 1b487b14c58..5b2ac230a0b 100644 --- a/src/caffe/test/test_db.cpp +++ b/src/caffe/test/test_db.cpp @@ -1,4 +1,3 @@ -#if defined(USE_LEVELDB) && defined(USE_LMDB) && defined(USE_OPENCV) #include #include "boost/scoped_ptr.hpp" @@ -133,4 +132,3 @@ TYPED_TEST(DBTest, TestWrite) { } } // namespace caffe -#endif // USE_LEVELDB, USE_LMDB and USE_OPENCV diff --git a/src/caffe/test/test_deconvolution_layer.cpp b/src/caffe/test/test_deconvolution_layer.cpp index 770e7b277ee..fc63d5efbe3 100644 --- a/src/caffe/test/test_deconvolution_layer.cpp +++ b/src/caffe/test/test_deconvolution_layer.cpp @@ -58,8 +58,8 @@ TYPED_TEST(DeconvolutionLayerTest, TestSetup) { LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - 
convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); convolution_param->set_num_output(4); this->blob_bottom_vec_.push_back(this->blob_bottom_2_); this->blob_top_vec_.push_back(this->blob_top_2_); @@ -96,8 +96,8 @@ TYPED_TEST(DeconvolutionLayerTest, TestSimpleDeconvolution) { LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); convolution_param->set_num_output(4); convolution_param->mutable_weight_filler()->set_type("constant"); convolution_param->mutable_weight_filler()->set_value(1); @@ -144,8 +144,8 @@ TYPED_TEST(DeconvolutionLayerTest, TestGradient) { layer_param.mutable_convolution_param(); this->blob_bottom_vec_.push_back(this->blob_bottom_2_); this->blob_top_vec_.push_back(this->blob_top_2_); - convolution_param->add_kernel_size(2); - convolution_param->add_stride(1); + convolution_param->set_kernel_size(2); + convolution_param->set_stride(1); convolution_param->set_num_output(1); convolution_param->mutable_weight_filler()->set_type("gaussian"); convolution_param->mutable_bias_filler()->set_type("gaussian"); @@ -155,151 +155,4 @@ TYPED_TEST(DeconvolutionLayerTest, TestGradient) { this->blob_top_vec_); } -TYPED_TEST(DeconvolutionLayerTest, TestNDAgainst2D) { - typedef typename TypeParam::Dtype Dtype; - const int kernel_h = 11; - const int kernel_w = 13; - vector bottom_shape(4); - bottom_shape[0] = 15; - bottom_shape[1] = 12; - bottom_shape[2] = kernel_h * 2; - bottom_shape[3] = kernel_w * 2; - FillerParameter filler_param; - GaussianFiller filler(filler_param); - for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { - this->blob_bottom_vec_[i]->Reshape(bottom_shape); - filler.Fill(this->blob_bottom_vec_[i]); - } - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - 
layer_param.mutable_convolution_param(); - convolution_param->set_num_output(18); - convolution_param->set_bias_term(false); - convolution_param->set_group(6); - convolution_param->set_kernel_h(kernel_h); - convolution_param->set_kernel_w(kernel_w); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - Blob weights; - Blob top_diff; - // Shape and fill weights and top_diff. - bool copy_diff; - bool reshape; - { - DeconvolutionLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - top_diff.ReshapeLike(*this->blob_top_); - filler.Fill(&top_diff); - ASSERT_EQ(1, layer.blobs().size()); - copy_diff = false; reshape = true; - weights.CopyFrom(*layer.blobs()[0], copy_diff, reshape); - } - vector propagate_down(1, true); - Blob result_2d; - Blob backward_result_2d; - Blob backward_weight_result_2d; - // Test with 2D im2col - { - caffe_set(this->blob_top_->count(), Dtype(0), - this->blob_top_->mutable_cpu_data()); - caffe_set(this->blob_bottom_->count(), Dtype(0), - this->blob_bottom_->mutable_cpu_diff()); - caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff()); - // Do SetUp and Forward; save Forward result in result_2d. - convolution_param->set_force_nd_im2col(false); - DeconvolutionLayer layer_2d(layer_param); - layer_2d.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(1, layer_2d.blobs().size()); - copy_diff = false; reshape = false; - layer_2d.blobs()[0]->CopyFrom(weights, copy_diff, reshape); - layer_2d.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - copy_diff = false; reshape = true; - result_2d.CopyFrom(*this->blob_top_, copy_diff, reshape); - // Copy pre-generated top diff into actual top diff; - // do Backward and save result in backward_result_2d. 
- ASSERT_EQ(this->blob_top_->shape(), top_diff.shape()); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_top_->mutable_cpu_diff()); - layer_2d.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - copy_diff = true; reshape = true; - backward_result_2d.CopyFrom(*this->blob_bottom_, copy_diff, reshape); - backward_weight_result_2d.CopyFrom(weights, copy_diff, reshape); - } - Blob result_nd; - Blob backward_result_nd; - Blob backward_weight_result_nd; - // Test with ND im2col - { - caffe_set(this->blob_top_->count(), Dtype(0), - this->blob_top_->mutable_cpu_data()); - caffe_set(this->blob_bottom_->count(), Dtype(0), - this->blob_bottom_->mutable_cpu_diff()); - caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff()); - // Do SetUp and Forward; save Forward result in result_nd. - convolution_param->set_force_nd_im2col(true); - DeconvolutionLayer layer_nd(layer_param); - layer_nd.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(1, layer_nd.blobs().size()); - copy_diff = false; reshape = false; - layer_nd.blobs()[0]->CopyFrom(weights, copy_diff, reshape); - layer_nd.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - copy_diff = false; reshape = true; - result_nd.CopyFrom(*this->blob_top_, copy_diff, reshape); - // Copy pre-generated top diff into actual top diff; - // do Backward and save result in backward_result_nd. 
- ASSERT_EQ(this->blob_top_->shape(), top_diff.shape()); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_top_->mutable_cpu_diff()); - layer_nd.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - copy_diff = true; reshape = true; - backward_result_nd.CopyFrom(*this->blob_bottom_, copy_diff, reshape); - backward_weight_result_nd.CopyFrom(weights, copy_diff, reshape); - } - ASSERT_EQ(result_nd.count(), result_2d.count()); - for (int i = 0; i < result_2d.count(); ++i) { - EXPECT_EQ(result_2d.cpu_data()[i], result_nd.cpu_data()[i]); - } - ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count()); - for (int i = 0; i < backward_result_2d.count(); ++i) { - EXPECT_EQ(backward_result_2d.cpu_diff()[i], - backward_result_nd.cpu_diff()[i]); - } - ASSERT_EQ(backward_weight_result_nd.count(), - backward_weight_result_2d.count()); - for (int i = 0; i < backward_weight_result_2d.count(); ++i) { - EXPECT_EQ(backward_weight_result_2d.cpu_diff()[i], - backward_weight_result_nd.cpu_diff()[i]); - } -} - -TYPED_TEST(DeconvolutionLayerTest, TestGradient3D) { - typedef typename TypeParam::Dtype Dtype; - vector bottom_shape(5); - bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0); - bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1); - bottom_shape[2] = 2; - bottom_shape[3] = 3; - bottom_shape[4] = 2; - FillerParameter filler_param; - GaussianFiller filler(filler_param); - for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { - this->blob_bottom_vec_[i]->Reshape(bottom_shape); - filler.Fill(this->blob_bottom_vec_[i]); - } - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(2); - convolution_param->add_stride(2); - convolution_param->add_pad(1); - convolution_param->set_num_output(2); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - 
DeconvolutionLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - } // namespace caffe diff --git a/src/caffe/test/test_eltwise_layer.cpp b/src/caffe/test/test_eltwise_layer.cpp index 8031f6e9022..be0c1347709 100644 --- a/src/caffe/test/test_eltwise_layer.cpp +++ b/src/caffe/test/test_eltwise_layer.cpp @@ -80,7 +80,7 @@ TYPED_TEST(EltwiseLayerTest, TestProd) { const Dtype* in_data_b = this->blob_bottom_b_->cpu_data(); const Dtype* in_data_c = this->blob_bottom_c_->cpu_data(); for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data_a[i] * in_data_b[i] * in_data_c[i], 1e-4); + EXPECT_EQ(data[i], in_data_a[i] * in_data_b[i] * in_data_c[i]); } } @@ -99,7 +99,7 @@ TYPED_TEST(EltwiseLayerTest, TestSum) { const Dtype* in_data_b = this->blob_bottom_b_->cpu_data(); const Dtype* in_data_c = this->blob_bottom_c_->cpu_data(); for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data_a[i] + in_data_b[i] + in_data_c[i], 1e-4); + EXPECT_EQ(data[i], in_data_a[i] + in_data_b[i] + in_data_c[i]); } } diff --git a/src/caffe/test/test_im2col_kernel.cu b/src/caffe/test/test_im2col_kernel.cu index f0b75fcc68d..0017ac23e69 100644 --- a/src/caffe/test/test_im2col_kernel.cu +++ b/src/caffe/test/test_im2col_kernel.cu @@ -22,12 +22,6 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im, const int height_col, const int width_col, Dtype* data_col); -template -__global__ void im2col_nd_gpu_kernel(const int n, const Dtype* data_im, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - Dtype* data_col); - extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; template @@ -36,18 +30,11 @@ class Im2colKernelTest : public GPUDeviceTest { Im2colKernelTest() // big so launches > 1024 threads : blob_bottom_(new Blob(5, 500, 10, 10)), - blob_kernel_shape_(new Blob()), - blob_stride_(new Blob()), - blob_pad_(new Blob()), 
blob_top_(new Blob()), blob_top_cpu_(new Blob()) { FillerParameter filler_param; GaussianFiller filler(filler_param); filler.Fill(this->blob_bottom_); - vector dim_blob_shape(1, 2); - blob_kernel_shape_->Reshape(dim_blob_shape); - blob_stride_->Reshape(dim_blob_shape); - blob_pad_->Reshape(dim_blob_shape); height_ = blob_bottom_->height(); width_ = blob_bottom_->width(); @@ -57,26 +44,14 @@ class Im2colKernelTest : public GPUDeviceTest { kernel_size_ = 3; height_col_ = (height_ + 2 * pad_ - kernel_size_) / stride_ + 1; width_col_ = (width_ + 2 * pad_ - kernel_size_) / stride_ + 1; - - for (int i = 0; i < 2; ++i) { - blob_kernel_shape_->mutable_cpu_data()[i] = kernel_size_; - blob_stride_->mutable_cpu_data()[i] = stride_; - blob_pad_->mutable_cpu_data()[i] = pad_; - } } virtual ~Im2colKernelTest() { - delete blob_bottom_; - delete blob_top_; - delete blob_top_cpu_; - delete blob_kernel_shape_; - delete blob_stride_; - delete blob_pad_; + delete blob_bottom_; + delete blob_top_; + delete blob_top_cpu_; } - Blob* const blob_kernel_shape_; - Blob* const blob_stride_; - Blob* const blob_pad_; Blob* const blob_bottom_; Blob* const blob_top_; Blob* const blob_top_cpu_; @@ -92,7 +67,7 @@ class Im2colKernelTest : public GPUDeviceTest { TYPED_TEST_CASE(Im2colKernelTest, TestDtypes); -TYPED_TEST(Im2colKernelTest, Test2D) { +TYPED_TEST(Im2colKernelTest, TestGPU) { // Reshape the blobs to correct size for im2col output this->blob_top_->Reshape(this->blob_bottom_->num(), this->channels_ * this->kernel_size_ * this->kernel_size_, @@ -147,58 +122,4 @@ TYPED_TEST(Im2colKernelTest, Test2D) { } } -TYPED_TEST(Im2colKernelTest, TestND) { - // Reshape the blobs to correct size for im2col output - this->blob_top_->Reshape(this->blob_bottom_->num(), - this->channels_ * this->kernel_size_ * this->kernel_size_, - this->height_col_, - this->width_col_); - - this->blob_top_cpu_->ReshapeLike(*this->blob_top_); - - const TypeParam* bottom_data_cpu = this->blob_bottom_->cpu_data(); - TypeParam* 
top_data_cpu = this->blob_top_cpu_->mutable_cpu_data(); - - // CPU Version - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - im2col_nd_cpu(bottom_data_cpu + this->blob_bottom_->offset(n), 2, - this->blob_bottom_->shape().data() + 1, - this->blob_top_cpu_->shape().data() + 1, - this->blob_kernel_shape_->cpu_data(), - this->blob_pad_->cpu_data(), this->blob_stride_->cpu_data(), - top_data_cpu + this->blob_top_cpu_->offset(n)); - } - - // GPU version - int num_kernels = this->channels_ * this->height_col_ * this->width_col_; - int default_grid_dim = CAFFE_GET_BLOCKS(num_kernels); - const TypeParam* bottom_data_gpu = this->blob_bottom_->gpu_data(); - - // Launch with different grid sizes - for (int grid_div = 2; grid_div <= 8; grid_div++) { - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - const int grid_dim = default_grid_dim / grid_div; - TypeParam* top_data_gpu = this->blob_top_->mutable_gpu_data(); - // NOLINT_NEXT_LINE(whitespace/operators) - im2col_nd_gpu_kernel<<>>( - num_kernels, bottom_data_gpu + this->blob_bottom_->offset(n), - this->blob_bottom_->gpu_shape() + 1, this->blob_top_->gpu_shape() + 1, - this->blob_kernel_shape_->gpu_data(), this->blob_pad_->gpu_data(), - this->blob_stride_->gpu_data(), - top_data_gpu + this->blob_top_->offset(n)); - CUDA_POST_KERNEL_CHECK; - } - - // Compare results against CPU version - for (int i = 0; i < this->blob_top_->count(); ++i) { - TypeParam cpuval = top_data_cpu[i]; - TypeParam gpuval = this->blob_top_->cpu_data()[i]; - EXPECT_EQ(cpuval, gpuval); - if (cpuval != gpuval) { - break; - } - } - } -} - } // namespace caffe diff --git a/src/caffe/test/test_im2col_layer.cpp b/src/caffe/test/test_im2col_layer.cpp index 293aa262059..f50abe103f8 100644 --- a/src/caffe/test/test_im2col_layer.cpp +++ b/src/caffe/test/test_im2col_layer.cpp @@ -21,7 +21,6 @@ class Im2colLayerTest : public MultiDeviceTest { : blob_bottom_(new Blob(2, 3, 6, 5)), blob_top_(new Blob()) { // fill the values - Caffe::set_random_seed(1701); 
FillerParameter filler_param; GaussianFiller filler(filler_param); filler.Fill(this->blob_bottom_); @@ -42,8 +41,8 @@ TYPED_TEST(Im2colLayerTest, TestSetup) { LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); Im2colLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); EXPECT_EQ(this->blob_top_->num(), 2); @@ -57,8 +56,8 @@ TYPED_TEST(Im2colLayerTest, TestForward) { LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); Im2colLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); @@ -74,27 +73,14 @@ TYPED_TEST(Im2colLayerTest, TestGradient) { LayerParameter layer_param; ConvolutionParameter* convolution_param = layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); + convolution_param->set_kernel_size(3); + convolution_param->set_stride(2); Im2colLayer layer(layer_param); GradientChecker checker(1e-2, 1e-2); checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, this->blob_top_vec_); } -TYPED_TEST(Im2colLayerTest, TestGradientForceND) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_force_nd_im2col(true); - Im2colLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - 
this->blob_top_vec_); -} TYPED_TEST(Im2colLayerTest, TestRect) { typedef typename TypeParam::Dtype Dtype; @@ -103,7 +89,7 @@ TYPED_TEST(Im2colLayerTest, TestRect) { layer_param.mutable_convolution_param(); convolution_param->set_kernel_h(5); convolution_param->set_kernel_w(3); - convolution_param->add_stride(2); + convolution_param->set_stride(2); Im2colLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); @@ -122,7 +108,7 @@ TYPED_TEST(Im2colLayerTest, TestRectGradient) { layer_param.mutable_convolution_param(); convolution_param->set_kernel_h(5); convolution_param->set_kernel_w(3); - convolution_param->add_stride(2); + convolution_param->set_stride(2); Im2colLayer layer(layer_param); GradientChecker checker(1e-2, 1e-2); checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, diff --git a/src/caffe/test/test_image_data_layer.cpp b/src/caffe/test/test_image_data_layer.cpp index 481fcef7b27..931a5ebf137 100644 --- a/src/caffe/test/test_image_data_layer.cpp +++ b/src/caffe/test/test_image_data_layer.cpp @@ -1,4 +1,3 @@ -#ifdef USE_OPENCV #include #include #include @@ -178,4 +177,3 @@ TYPED_TEST(ImageDataLayerTest, TestShuffle) { } } // namespace caffe -#endif // USE_OPENCV diff --git a/src/caffe/test/test_io.cpp b/src/caffe/test/test_io.cpp index c2c919e90dc..4ab96311bbc 100644 --- a/src/caffe/test/test_io.cpp +++ b/src/caffe/test/test_io.cpp @@ -1,4 +1,3 @@ -#ifdef USE_OPENCV #include #include #include @@ -421,4 +420,3 @@ TEST_F(IOTest, TestDecodeDatumToCVMatContentNative) { } } // namespace caffe -#endif // USE_OPENCV diff --git a/src/caffe/test/test_layer_factory.cpp b/src/caffe/test/test_layer_factory.cpp index 7d5d39d8b91..c86fafd000c 100644 --- a/src/caffe/test/test_layer_factory.cpp +++ b/src/caffe/test/test_layer_factory.cpp @@ -31,16 +31,12 @@ TYPED_TEST(LayerFactoryTest, TestCreateLayer) { LayerParameter layer_param; // Data layers expect a DB if (iter->first 
== "Data") { -#ifdef USE_LEVELDB string tmp; MakeTempDir(&tmp); boost::scoped_ptr db(db::GetDB(DataParameter_DB_LEVELDB)); db->Open(tmp, db::NEW); db->Close(); layer_param.mutable_data_param()->set_source(tmp); -#else - continue; -#endif // USE_LEVELDB } layer_param.set_type(iter->first); layer = LayerRegistry::CreateLayer(layer_param); diff --git a/src/caffe/test/test_memory_data_layer.cpp b/src/caffe/test/test_memory_data_layer.cpp index 7269a4d441b..a79033f59f1 100644 --- a/src/caffe/test/test_memory_data_layer.cpp +++ b/src/caffe/test/test_memory_data_layer.cpp @@ -1,6 +1,4 @@ -#ifdef USE_OPENCV #include -#endif // USE_OPENCV #include #include @@ -115,7 +113,6 @@ TYPED_TEST(MemoryDataLayerTest, TestForward) { } } -#ifdef USE_OPENCV TYPED_TEST(MemoryDataLayerTest, AddDatumVectorDefaultTransform) { typedef typename TypeParam::Dtype Dtype; @@ -295,5 +292,5 @@ TYPED_TEST(MemoryDataLayerTest, TestSetBatchSize) { } } } -#endif // USE_OPENCV + } // namespace caffe diff --git a/src/caffe/test/test_slice_layer.cpp b/src/caffe/test/test_slice_layer.cpp index 2d2d0fdc005..ccd03646d19 100644 --- a/src/caffe/test/test_slice_layer.cpp +++ b/src/caffe/test/test_slice_layer.cpp @@ -88,21 +88,6 @@ TYPED_TEST(SliceLayerTest, TestSetupChannels) { EXPECT_EQ(this->blob_bottom_->width(), this->blob_top_0_->width()); } -TYPED_TEST(SliceLayerTest, TestTrivialSlice) { - // Test the trivial (single output) "slice" operation -- - // should be the identity. 
- typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - SliceLayer layer(layer_param); - this->blob_top_vec_0_.resize(1); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_0_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_0_->shape()); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_EQ(this->blob_bottom_->cpu_data()[i], - this->blob_top_0_->cpu_data()[i]); - } -} - TYPED_TEST(SliceLayerTest, TestSliceAcrossNum) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; @@ -176,18 +161,6 @@ TYPED_TEST(SliceLayerTest, TestSliceAcrossChannels) { } } -TYPED_TEST(SliceLayerTest, TestGradientTrivial) { - // Test the trivial (single output) "slice" operation -- - // should be the identity. - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - SliceLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - this->blob_top_vec_0_.resize(1); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_0_); -} - TYPED_TEST(SliceLayerTest, TestGradientAcrossNum) { typedef typename TypeParam::Dtype Dtype; // Gradient checks are slow; reduce blob size. 
diff --git a/src/caffe/test/test_triplet_loss_layer b/src/caffe/test/test_triplet_loss_layer new file mode 100644 index 00000000000..6c25ce9bd4b --- /dev/null +++ b/src/caffe/test/test_triplet_loss_layer @@ -0,0 +1,125 @@ +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/vision_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class TripletLossLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + TripletLossLayerTest() + : blob_bottom_data_(new Blob(50, 1, 1, 1)), + blob_bottom_y_(new Blob(50, 1, 1, 1)), + blob_top_loss_(new Blob()) { + // fill the values + FillerParameter filler_param; + filler_param.set_min(-1.0); + filler_param.set_max(1.0); // distances~=1.0 to test both sides of margin + UniformFiller filler(filler_param); + filler.Fill(this->blob_bottom_data_); + blob_bottom_vec_.push_back(blob_bottom_data_); + for (int i = 0; i < blob_bottom_y_->count(); ++i) { + blob_bottom_y_->mutable_cpu_data()[i] = caffe_rng_rand() % 2; // 0 or 1 + } + blob_bottom_vec_.push_back(blob_bottom_y_); + blob_top_vec_.push_back(blob_top_loss_); + } + virtual ~TripletLossLayerTest() { + delete blob_bottom_data_; + delete blob_bottom_y_; + delete blob_top_loss_; + } + + Blob* const blob_bottom_data_; + Blob* const blob_bottom_y_; + Blob* const blob_top_loss_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +TYPED_TEST_CASE(TripletLossLayerTest, TestDtypesAndDevices); + +TYPED_TEST(TripletLossLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + TripletLossLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // manually compute to compare + const Dtype margin = 
layer_param.triplet_loss_param().margin(); + const Dtype losstype = 0;//layer_param.triplet_loss_param().losstype(); + const int num_triplets = 3; + const int num_set = this->blob_bottom_data_->num()/(2 + num_triplets); + const int channels = this->blob_bottom_data_->channels(); + Dtype loss(0); + if (losstype == 0) { + for (int i = 0; i < num_set; ++i) { + Dtype dist_par(0); + for (int j = 0; j < channels; ++j) { + Dtype diff_pos = this->blob_bottom_data_->cpu_data()[(2+num_triplets)*i*channels+j] - + this->blob_bottom_data_->cpu_data()[((2+num_triplets)*i+1)*channels+j]; + dist_par = diff_pos*diff_pos; + loss += dist_par; + } + for (int triplet = 0; triplet < num_triplets; ++triplet) { + Dtype dist_sq(0); + for (int j = 0; j < channels; ++j) { + Dtype diff_pos = this->blob_bottom_data_->cpu_data()[(2+num_triplets)*i*channels+j] - + this->blob_bottom_data_->cpu_data()[((2+num_triplets)*i+1)*channels+j]; + dist_sq += diff_pos*diff_pos; + Dtype diff_neg = this->blob_bottom_data_->cpu_data()[(2+num_triplets)*i*channels+j] - + this->blob_bottom_data_->cpu_data()[((2+num_triplets)*i+2+triplet)*channels+j]; + dist_sq -= diff_neg*diff_neg; + } + loss += std::max(margin + dist_sq, Dtype(0.0)); + } + } + } /*else { + for (int i = 0; i < num; ++i) { + Dtype dist_sq(0); + Dtype dist_par(0); + for (int j = 0; j < channels; ++j) { + Dtype diff_pos = this->blob_bottom_data_i_->cpu_data()[i*channels+j] - + this->blob_bottom_data_j_->cpu_data()[i*channels+j]; + dist_sq += diff_pos*diff_pos; + dist_sq += margin; + Dtype diff_neg = this->blob_bottom_data_i_->cpu_data()[i*channels+j] - + this->blob_bottom_data_k_->cpu_data()[i*channels+j]; + dist_sq = 1 - diff_neg*diff_neg/dist_sq; + Dtype diff_par = this->blob_bottom_data_l_->cpu_data()[i*channels+j] - + this->blob_bottom_data_m_->cpu_data()[i*channels+j]; + dist_par = diff_par*diff_par; + } + loss += std::max(dist_sq, Dtype(0.0)); + loss += dist_par; + } + }*/ + loss /= static_cast(num_set) * Dtype(2); + 
EXPECT_NEAR(this->blob_top_loss_->cpu_data()[0], loss, 1e-6); +} + +TYPED_TEST(TripletLossLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + TripletLossLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + GradientChecker checker(1e-2, 1e-2, 1701); + // check the gradient for the first 5 bottom layers + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} +} // namespace caffe diff --git a/src/caffe/test/test_upgrade_proto.cpp b/src/caffe/test/test_upgrade_proto.cpp index ee05b151e72..006720231a5 100644 --- a/src/caffe/test/test_upgrade_proto.cpp +++ b/src/caffe/test/test_upgrade_proto.cpp @@ -2892,7 +2892,6 @@ TEST_F(NetUpgradeTest, TestImageNet) { this->RunV1UpgradeTest(expected_v1_proto, expected_v2_proto); } // NOLINT(readability/fn_size) -#ifdef USE_OPENCV TEST_F(NetUpgradeTest, TestUpgradeV1LayerType) { LayerParameter layer_param; shared_ptr > layer; @@ -2907,25 +2906,16 @@ TEST_F(NetUpgradeTest, TestUpgradeV1LayerType) { layer_param.set_type(v2_layer_type); // Data layers expect a DB if (v2_layer_type == "Data") { - #ifdef USE_LEVELDB string tmp; MakeTempDir(&tmp); boost::scoped_ptr db(db::GetDB(DataParameter_DB_LEVELDB)); db->Open(tmp, db::NEW); db->Close(); layer_param.mutable_data_param()->set_source(tmp); - #else - continue; - #endif // USE_LEVELDB } - #ifndef USE_OPENCV - if (v2_layer_type == "ImageData" || v2_layer_type == "WindowData") { - continue; - } - #endif // !USE_OPENCV layer = LayerRegistry::CreateLayer(layer_param); EXPECT_EQ(v2_layer_type, layer->type()); } } -#endif // USE_OPENCV + } // NOLINT(readability/fn_size) // namespace caffe diff --git a/src/caffe/util/db.cpp b/src/caffe/util/db.cpp index ccda054d881..f55420e9840 100644 --- a/src/caffe/util/db.cpp +++ b/src/caffe/util/db.cpp @@ -8,31 +8,23 @@ namespace caffe { namespace db { DB* GetDB(DataParameter::DB backend) { switch (backend) { -#ifdef USE_LEVELDB case 
DataParameter_DB_LEVELDB: return new LevelDB(); -#endif // USE_LEVELDB -#ifdef USE_LMDB case DataParameter_DB_LMDB: return new LMDB(); -#endif // USE_LMDB default: LOG(FATAL) << "Unknown database backend"; } } DB* GetDB(const string& backend) { -#ifdef USE_LEVELDB if (backend == "leveldb") { return new LevelDB(); - } -#endif // USE_LEVELDB -#ifdef USE_LMDB - if (backend == "lmdb") { + } else if (backend == "lmdb") { return new LMDB(); + } else { + LOG(FATAL) << "Unknown database backend"; } -#endif // USE_LMDB - LOG(FATAL) << "Unknown database backend"; } } // namespace db diff --git a/src/caffe/util/db_leveldb.cpp b/src/caffe/util/db_leveldb.cpp index f5c4d8a660d..06c46627d31 100644 --- a/src/caffe/util/db_leveldb.cpp +++ b/src/caffe/util/db_leveldb.cpp @@ -1,4 +1,3 @@ -#ifdef USE_LEVELDB #include "caffe/util/db_leveldb.hpp" #include @@ -20,4 +19,3 @@ void LevelDB::Open(const string& source, Mode mode) { } // namespace db } // namespace caffe -#endif // USE_LEVELDB diff --git a/src/caffe/util/db_lmdb.cpp b/src/caffe/util/db_lmdb.cpp index 78dd880ac41..a054b796806 100644 --- a/src/caffe/util/db_lmdb.cpp +++ b/src/caffe/util/db_lmdb.cpp @@ -1,4 +1,3 @@ -#ifdef USE_LMDB #include "caffe/util/db_lmdb.hpp" #include @@ -50,4 +49,3 @@ void LMDBTransaction::Put(const string& key, const string& value) { } // namespace db } // namespace caffe -#endif // USE_LMDB diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index b0a7be50e5c..c48f31f35d4 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -1,7 +1,6 @@ #include #include #include -#include #include "caffe/util/im2col.hpp" #include "caffe/util/math_functions.hpp" @@ -45,98 +44,6 @@ template void im2col_cpu(const double* data_im, const int channels, const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_col); -template -inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col, - const int num_spatial_axes, const int* im_shape, const int* 
col_shape, - const int* kernel_shape, const int* pad, const int* stride, - Dtype* data_output) { - if (!im2col) { - int im_size = im_shape[0]; - for (int i = 0; i < num_spatial_axes; ++i) { - im_size *= im_shape[1 + i]; - } - caffe_set(im_size, Dtype(0), data_output); - } - int kernel_size = 1; - for (int i = 0; i < num_spatial_axes; ++i) { - kernel_size *= kernel_shape[i]; - } - const int channels_col = col_shape[0]; - vector d_offset(num_spatial_axes, 0); - vector d_iter(num_spatial_axes, 0); - for (int c = 0; c < channels_col; ++c) { - // Loop over spatial axes in reverse order to compute a per-axis offset. - int offset = c; - for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) { - if (d_i < num_spatial_axes - 1) { - offset /= kernel_shape[d_i + 1]; - } - d_offset[d_i] = offset % kernel_shape[d_i]; - } - for (bool incremented = true; incremented; ) { - // Loop over spatial axes in forward order to compute the indices in the - // image and column, and whether the index lies in the padding. - int index_col = c; - int index_im = c / kernel_size; - bool is_padding = false; - for (int d_i = 0; d_i < num_spatial_axes; ++d_i) { - const int d = d_iter[d_i]; - const int d_pad = d * stride[d_i] - pad[d_i] + d_offset[d_i]; - is_padding |= d_pad < 0 || d_pad >= im_shape[d_i + 1]; - index_col *= col_shape[d_i + 1]; - index_col += d; - index_im *= im_shape[d_i + 1]; - index_im += d_pad; - } - if (im2col) { - if (is_padding) { - data_output[index_col] = 0; - } else { - data_output[index_col] = data_input[index_im]; - } - } else if (!is_padding) { // col2im - data_output[index_im] += data_input[index_col]; - } - // Loop over spatial axes in reverse order to choose an index, - // like counting. 
- incremented = false; - for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) { - const int d_max = col_shape[d_i + 1]; - DCHECK_LT(d_iter[d_i], d_max); - if (d_iter[d_i] == d_max - 1) { - d_iter[d_i] = 0; - } else { // d_iter[d_i] < d_max - 1 - ++d_iter[d_i]; - incremented = true; - break; - } - } - } // while(incremented) { - } // for (int c = 0; c < channels_col; ++c) { -} - -template -void im2col_nd_cpu(const Dtype* data_im, const int num_spatial_axes, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - Dtype* data_col) { - const bool kIm2Col = true; - im2col_nd_core_cpu(data_im, kIm2Col, num_spatial_axes, im_shape, col_shape, - kernel_shape, pad, stride, data_col); -} - -// Explicit instantiation -template void im2col_nd_cpu(const float* data_im, - const int num_spatial_axes, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - float* data_col); -template void im2col_nd_cpu(const double* data_im, - const int num_spatial_axes, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - double* data_col); - template void col2im_cpu(const Dtype* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, @@ -173,27 +80,4 @@ template void col2im_cpu(const double* data_col, const int channels, const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_im); -template -void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - Dtype* data_im) { - const bool kIm2Col = false; - im2col_nd_core_cpu(data_col, kIm2Col, num_spatial_axes, im_shape, col_shape, - kernel_shape, pad, stride, data_im); -} - -// Explicit instantiation -template void col2im_nd_cpu(const float* data_col, - const int num_spatial_axes, - const int* 
im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - float* data_im); -template void col2im_nd_cpu(const double* data_col, - const int num_spatial_axes, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - double* data_im); - - } // namespace caffe diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu index 5a478ba62d2..c90f93eb67b 100644 --- a/src/caffe/util/im2col.cu +++ b/src/caffe/util/im2col.cu @@ -59,6 +59,7 @@ void im2col_gpu(const Dtype* data_im, const int channels, CUDA_POST_KERNEL_CHECK; } + // Explicit instantiation template void im2col_gpu(const float* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, @@ -69,156 +70,6 @@ template void im2col_gpu(const double* data_im, const int channels, const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_col); -template -__global__ void im2col_nd_gpu_kernel(const int n, const Dtype* data_im, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - Dtype* data_col) { - int d_temp[num_axes]; // NOLINT(runtime/arrays) - int d_iter[num_axes]; // NOLINT(runtime/arrays) - int i; - CUDA_KERNEL_LOOP(index, n) { - // Initialize channel_in, computed in the loop below, with intermediate - // computations used to compute the spatial indices. 
- int channel_in = index; - int channel_out = 1; - for (i = num_axes - 1; i >= 0; --i) { - d_temp[i] = channel_in % col_shape[i + 1]; - channel_in /= col_shape[i + 1]; - channel_out *= kernel_shape[i]; - } - channel_out *= channel_in; - int data_col_inc = 1; - for (i = 0; i < num_axes; ++i) { - channel_out *= col_shape[i + 1]; - channel_out += d_temp[i]; - d_temp[i] = d_temp[i] * stride[i] - pad[i]; - channel_in *= im_shape[i + 1]; - channel_in += d_temp[i]; - data_col_inc *= col_shape[i + 1]; - d_iter[i] = 0; - } - Dtype* data_col_ptr = data_col + channel_out; - const Dtype* data_im_ptr = data_im + channel_in; - bool incremented; - do { - bool in_range = true; - for (i = 0; i < num_axes; ++i) { - const int d_iter_im = d_iter[i] + d_temp[i]; - in_range &= d_iter_im >= 0 && d_iter_im < im_shape[i + 1]; - if (!in_range) { break; } - } - if (in_range) { - int data_im_offset = d_iter[0]; - for (i = 1; i < num_axes; ++i) { - data_im_offset *= im_shape[i + 1]; - data_im_offset += d_iter[i]; - } - *data_col_ptr = data_im_ptr[data_im_offset]; - } else { - *data_col_ptr = 0; - } - data_col_ptr += data_col_inc; - incremented = false; - for (i = num_axes - 1; i >= 0; --i) { - const int d_max = kernel_shape[i]; - if (d_iter[i] == d_max - 1) { - d_iter[i] = 0; - } else { // d_iter[i] < d_max - 1 - ++d_iter[i]; - incremented = true; - break; - } - } // for (int i = num_axes - 1; i >= 0; --i) - } while (incremented); // do - } // CUDA_KERNEL_LOOP(index, n) -} - -template -void im2col_nd_gpu(const Dtype* data_im, const int num_spatial_axes, - const int num_kernels, const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - Dtype* data_col) { - switch (num_spatial_axes) { - case 1: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, data_col); - break; - case 2: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - 
num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, data_col); - break; - case 3: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, data_col); - break; - case 4: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, data_col); - break; - case 5: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, data_col); - break; - case 6: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, data_col); - break; - case 7: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, data_col); - break; - case 8: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, data_col); - break; - case 9: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, data_col); - break; - case 10: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, data_col); - break; - default: - LOG(FATAL) << "im2col_nd_gpu does not support computation with " - << num_spatial_axes << " spatial axes"; - } - CUDA_POST_KERNEL_CHECK; -} - -// Explicit instantiation -template void im2col_nd_gpu(const float* data_im, - const int num_spatial_axes, const int col_size, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - float* data_col); -template void im2col_nd_gpu(const double* data_im, - const int 
num_spatial_axes, const int col_size, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - double* data_col); - template __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col, const int height, const int width, const int channels, @@ -290,159 +141,4 @@ template void col2im_gpu(const double* data_col, const int channels, const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_im); -template -__global__ void col2im_nd_gpu_kernel(const int n, const Dtype* data_col, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - Dtype* data_im) { - int d_im[num_axes]; // NOLINT(runtime/arrays) - int d_col_iter[num_axes]; // NOLINT(runtime/arrays) - int d_col_start[num_axes]; // NOLINT(runtime/arrays) - int d_col_end[num_axes]; // NOLINT(runtime/arrays) - CUDA_KERNEL_LOOP(index, n) { - // Initialize channel_in, computed in the loop below, with intermediate - // computations used to compute the spatial indices. - int channel_im = index; - // Calculate d_im (image dimensions). - for (int i = num_axes - 1; i >= 0; --i) { - d_im[i] = channel_im % im_shape[i + 1] + pad[i]; - channel_im /= im_shape[i + 1]; - } - // Calculate col start/end indices. - bool done = false; - for (int i = 0; i < num_axes; ++i) { - d_col_start[i] = d_col_iter[i] = - (d_im[i] < kernel_shape[i]) ? - 0 : (d_im[i] - kernel_shape[i]) / stride[i] + 1; - d_col_end[i] = min(d_im[i] / stride[i] + 1, col_shape[i + 1]); - if (d_col_start[i] >= d_col_end[i]) { - // Skip computation if the dimension is 0 at any spatial axis -- - // final val will be 0. - data_im[index] = 0; - done = true; - break; // for (int i = 0; i < num_axes; ++i) - } - } - if (done) { - continue; // CUDA_KERNEL_LOOP(index, n) - } - // Loop over the col to compute the output val. - Dtype val = 0; - bool incremented = true; - do { - // Compute the final offset. 
- int final_offset = 0; - int kernel_shape_prod = 1; - for (int i = num_axes - 1; i >= 0; --i) { - final_offset += - (d_im[i] - d_col_iter[i] * stride[i]) * kernel_shape_prod; - kernel_shape_prod *= kernel_shape[i]; - } - final_offset += kernel_shape_prod * channel_im; - for (int i = 0; i < num_axes; ++i) { - final_offset *= col_shape[i + 1]; - final_offset += d_col_iter[i]; - } - val += data_col[final_offset]; - incremented = false; - for (int i = num_axes - 1; i >= 0; --i) { - const int d_max = d_col_end[i]; - if (d_col_iter[i] == d_max - 1) { - d_col_iter[i] = d_col_start[i]; - } else { // d_col_iter[i] < d_max - 1 - ++d_col_iter[i]; - incremented = true; - break; // for (int i = num_axes - 1; i >= 0; --i) - } - } // for (int i = num_axes - 1; i >= 0; --i) - } while (incremented); - data_im[index] = val; - } // CUDA_KERNEL_LOOP(index, n) -} - -template -void col2im_nd_gpu(const Dtype* data_col, const int num_spatial_axes, - const int im_size, const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - Dtype* data_im) { - switch (num_spatial_axes) { - case 1: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, data_im); - break; - case 2: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, data_im); - break; - case 3: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, data_im); - break; - case 4: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, data_im); - break; - case 5: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, data_im); - break; - case 6: - 
col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, data_im); - break; - case 7: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, data_im); - break; - case 8: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, data_im); - break; - case 9: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, data_im); - break; - case 10: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, data_im); - break; - default: - LOG(FATAL) << "col2im_nd_gpu does not support computation with " - << num_spatial_axes << " spatial axes"; - } - CUDA_POST_KERNEL_CHECK; -} - -// Explicit instantiation -template void col2im_nd_gpu(const float* data_col, - const int num_spatial_axes, const int im_size, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - float* data_im); -template void col2im_nd_gpu(const double* data_col, - const int num_spatial_axes, const int im_size, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - double* data_im); - } // namespace caffe diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index f2b1dd98423..6f03314202c 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -3,11 +3,9 @@ #include #include #include -#ifdef USE_OPENCV #include #include #include -#endif // USE_OPENCV #include #include @@ -69,7 +67,6 @@ void WriteProtoToBinaryFile(const Message& proto, const char* filename) { CHECK(proto.SerializeToOstream(&output)); } -#ifdef USE_OPENCV cv::Mat ReadImageToCVMat(const string& 
filename, const int height, const int width, const bool is_color) { cv::Mat cv_img; @@ -101,7 +98,6 @@ cv::Mat ReadImageToCVMat(const string& filename, cv::Mat ReadImageToCVMat(const string& filename) { return ReadImageToCVMat(filename, 0, 0, true); } - // Do the file extension and encoding match? static bool matchExt(const std::string & fn, std::string en) { @@ -115,7 +111,6 @@ static bool matchExt(const std::string & fn, return true; return false; } - bool ReadImageToDatum(const string& filename, const int label, const int height, const int width, const bool is_color, const std::string & encoding, Datum* datum) { @@ -140,7 +135,6 @@ bool ReadImageToDatum(const string& filename, const int label, return false; } } -#endif // USE_OPENCV bool ReadFileToDatum(const string& filename, const int label, Datum* datum) { @@ -162,7 +156,6 @@ bool ReadFileToDatum(const string& filename, const int label, } } -#ifdef USE_OPENCV cv::Mat DecodeDatumToCVMatNative(const Datum& datum) { cv::Mat cv_img; CHECK(datum.encoded()) << "Datum not encoded"; @@ -234,5 +227,6 @@ void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) { } datum->set_data(buffer); } -#endif // USE_OPENCV + + } // namespace caffe diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp index ac379e50f4f..4703eb4c1b4 100644 --- a/src/caffe/util/upgrade_proto.cpp +++ b/src/caffe/util/upgrade_proto.cpp @@ -193,7 +193,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, } if (v0_layer_param.has_pad()) { if (type == "conv") { - layer_param->mutable_convolution_param()->add_pad(v0_layer_param.pad()); + layer_param->mutable_convolution_param()->set_pad(v0_layer_param.pad()); } else if (type == "pool") { layer_param->mutable_pooling_param()->set_pad(v0_layer_param.pad()); } else { @@ -203,7 +203,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, } if (v0_layer_param.has_kernelsize()) { if (type == "conv") { - 
layer_param->mutable_convolution_param()->add_kernel_size( + layer_param->mutable_convolution_param()->set_kernel_size( v0_layer_param.kernelsize()); } else if (type == "pool") { layer_param->mutable_pooling_param()->set_kernel_size( @@ -224,7 +224,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection, } if (v0_layer_param.has_stride()) { if (type == "conv") { - layer_param->mutable_convolution_param()->add_stride( + layer_param->mutable_convolution_param()->set_stride( v0_layer_param.stride()); } else if (type == "pool") { layer_param->mutable_pooling_param()->set_stride( @@ -816,6 +816,10 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param, layer_param->mutable_threshold_param()->CopyFrom( v1_layer_param.threshold_param()); } + if (v1_layer_param.has_triplet_loss_param()) { + layer_param->mutable_triplet_loss_param()->CopyFrom( + v1_layer_param.triplet_loss_param()); + } if (v1_layer_param.has_window_data_param()) { layer_param->mutable_window_data_param()->CopyFrom( v1_layer_param.window_data_param()); @@ -913,6 +917,8 @@ const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type) { return "Slice"; case V1LayerParameter_LayerType_TANH: return "TanH"; + case V1LayerParameter_LayerType_TRIPLET_LOSS: + return "TripletLoss"; case V1LayerParameter_LayerType_WINDOW_DATA: return "WindowData"; case V1LayerParameter_LayerType_THRESHOLD: diff --git a/tools/caffe.cpp b/tools/caffe.cpp index e3f684b5ab3..ff63860a3c1 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -174,7 +174,6 @@ int train() { vector gpus; get_gpus(&gpus); if (gpus.size() == 0) { - LOG(INFO) << "Use CPU."; Caffe::set_mode(Caffe::CPU); } else { ostringstream s; diff --git a/tools/compute_image_mean.cpp b/tools/compute_image_mean.cpp index 2035d515195..b1fc7cae38f 100644 --- a/tools/compute_image_mean.cpp +++ b/tools/compute_image_mean.cpp @@ -24,7 +24,6 @@ DEFINE_string(backend, "lmdb", int main(int argc, char** argv) { 
::google::InitGoogleLogging(argv[0]); -#ifdef USE_OPENCV #ifndef GFLAGS_GFLAGS_H_ namespace gflags = google; #endif @@ -116,8 +115,5 @@ int main(int argc, char** argv) { } LOG(INFO) << "mean_value channel [" << c << "]:" << mean_values[c] / dim; } -#else - LOG(FATAL) << "This tool requires OpenCV; compile with USE_OPENCV."; -#endif // USE_OPENCV return 0; } diff --git a/tools/convert_imageset.cpp b/tools/convert_imageset.cpp index e51a2631077..aad1f1fe216 100644 --- a/tools/convert_imageset.cpp +++ b/tools/convert_imageset.cpp @@ -43,7 +43,6 @@ DEFINE_string(encode_type, "", "Optional: What type should we encode the image as ('png','jpg',...)."); int main(int argc, char** argv) { -#ifdef USE_OPENCV ::google::InitGoogleLogging(argv[0]); // Print output to stderr (while still logging) FLAGS_alsologtostderr = 1; @@ -151,8 +150,5 @@ int main(int argc, char** argv) { txn->Commit(); LOG(INFO) << "Processed " << count << " files."; } -#else - LOG(FATAL) << "This tool requires OpenCV; compile with USE_OPENCV."; -#endif // USE_OPENCV return 0; } diff --git a/tools/extract_features.cpp b/tools/extract_features.cpp index 084c9bf88df..365dd495bbf 100644 --- a/tools/extract_features.cpp +++ b/tools/extract_features.cpp @@ -42,7 +42,7 @@ int feature_extraction_pipeline(int argc, char** argv) { " save_feature_dataset_name1[,name2,...] num_mini_batches db_type" " [CPU/GPU] [DEVICE_ID=0]\n" "Note: you can extract multiple features in one pass by specifying" - " multiple feature blob names and dataset names separated by ','." + " multiple feature blob names and dataset names seperated by ','." " The names cannot contain white space characters and the number of blobs" " and datasets must be equal."; return 1;