From 1bd0c798a4cef5e63b6ecd7ff19e7364cfb188fb Mon Sep 17 00:00:00 2001
From: Wangyida <wangyida123@outlook.com>
Date: Thu, 3 Dec 2015 11:08:24 +0800
Subject: [PATCH] Implement CNN Triplet training.

---
 examples/triplet/convert_triplet_data.cpp     | 236 ++++++
 examples/triplet/create_3d_triplet.sh         |  27 +
 examples/triplet/multipie_triplet.prototxt    | 110 +++
 .../triplet/multipie_triplet_solver.prototxt  |  25 +
 .../multipie_triplet_train_test.prototxt      | 191 +++++
 examples/triplet/objtrans.py                  |  18 +
 examples/triplet/pascal3d_triplet.prototxt    | 295 ++++++++
 .../triplet/pascal3d_triplet_solver.prototxt  |  25 +
 .../pascal3d_triplet_train_test.prototxt      | 335 +++++++++
 examples/triplet/readme.md                    |  97 +++
 examples/triplet/train_multipie_triplet.sh    |   8 +
 examples/triplet/train_pascal_triplet.sh      |   5 +
 include/caffe/layers/triplet_loss_layer.hpp   |  78 ++
 src/caffe/layers/triplet_loss_layer.cpp       | 684 ++++++++++++++++++
 src/caffe/layers/triplet_loss_layer.cu        | 650 +++++++++++++++++
 src/caffe/proto/caffe.proto                   |  12 +-
 src/caffe/test/test_triplet_loss_layer.cpp    | 128 ++++
 17 files changed, 2923 insertions(+), 1 deletion(-)
 create mode 100644 examples/triplet/convert_triplet_data.cpp
 create mode 100644 examples/triplet/create_3d_triplet.sh
 create mode 100644 examples/triplet/multipie_triplet.prototxt 
 create mode 100644 examples/triplet/multipie_triplet_solver.prototxt
 create mode 100644 examples/triplet/multipie_triplet_train_test.prototxt
 create mode 100644 examples/triplet/objtrans.py
 create mode 100644 examples/triplet/pascal3d_triplet.prototxt
 create mode 100644 examples/triplet/pascal3d_triplet_solver.prototxt
 create mode 100644 examples/triplet/pascal3d_triplet_train_test.prototxt
 create mode 100644 examples/triplet/readme.md
 create mode 100644 examples/triplet/train_multipie_triplet.sh
 create mode 100644 examples/triplet/train_pascal_triplet.sh
 create mode 100644 include/caffe/layers/triplet_loss_layer.hpp
 create mode 100644 src/caffe/layers/triplet_loss_layer.cpp
 create mode 100755 src/caffe/layers/triplet_loss_layer.cu
 create mode 100644 src/caffe/test/test_triplet_loss_layer.cpp
diff --git a/examples/triplet/convert_triplet_data.cpp b/examples/triplet/convert_triplet_data.cpp
new file mode 100644
index 00000000000..7eb62479b32
--- /dev/null
+++ b/examples/triplet/convert_triplet_data.cpp
@@ -0,0 +1,236 @@
+// Usage:
+// convert_3d_data input_image_file input_label_file output_db_file
+// Codes are disigned for binary files including data and label. You can modify
+// the condition if information for arranging training data is not the same with
+// category and pose of object.
+#include <fstream>  // NOLINT(readability/streams)
+#include <string>
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/util/math_functions.hpp"
+#include "glog/logging.h"
+#include "google/protobuf/text_format.h"
+#ifdef USE_LEVELDB
+#include "leveldb/db.h"
+#include "math.h"
+#include "stdint.h"
+
+uint32_t swap_endian(uint32_t val) {
+    val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
+    return (val << 16) | (val >> 16);
+}
+
+void read_image(std::ifstream* image_file, std::ifstream* label_file,
+        uint32_t index, uint32_t rows, uint32_t cols,
+        char* pixels, char* label_temp, signed char* label, int rgb_use) {
+  if (rgb_use == 0) {
+    image_file->seekg(index * rows * cols + 16);
+    image_file->read(pixels, rows * cols);
+    label_file->seekg(index * 4 + 8);  // 4 = 1 catory label+3 coordinate label
+    label_file->read(label_temp, 4);
+    for (int i = 0; i < 4; i++)
+      *(label+i) = (signed char)*(label_temp+i);
+  } else {
+    image_file->seekg(3 * index * rows * cols + 16);
+    image_file->read(pixels, 3 * rows * cols);
+    label_file->seekg(index * 4 + 8);  // 4 = 1 catory label+3 coordinate label
+    label_file->read(label_temp, 4);
+    for (int i = 0; i < 4; i++)
+      *(label+i) = (signed char)*(label_temp+i);
+  }
+}
+
+void convert_dataset(const char* image_filename, const char* label_filename,
+        const char* db_filename,
+                     const char* class_number, const char* rgb_use) {
+  int rgb_use1 = atoi(rgb_use);
+  int class_num = atoi(class_number);
+  // Open files
+  std::ifstream image_file(image_filename, std::ios::in | std::ios::binary);
+  std::ifstream label_file(label_filename, std::ios::in | std::ios::binary);
+  CHECK(image_file) << "Unable to open file " << image_filename;
+  CHECK(label_file) << "Unable to open file " << label_filename;
+  // Read the magic and the meta data
+  uint32_t magic;
+  uint32_t num_items;
+  uint32_t num_labels;
+  uint32_t rows;
+  uint32_t cols;
+
+  image_file.read(reinterpret_cast<char*>(&magic), 4);
+  magic = swap_endian(magic);
+  CHECK_EQ(magic, 2051) << "Incorrect image file magic.";
+  label_file.read(reinterpret_cast<char*>(&magic), 4);
+  magic = swap_endian(magic);
+  CHECK_EQ(magic, 2050) << "Incorrect label file magic.";
+  image_file.read(reinterpret_cast<char*>(&num_items), 4);
+  num_items = swap_endian(num_items);
+  label_file.read(reinterpret_cast<char*>(&num_labels), 4);
+  num_labels = swap_endian(num_labels);
+  CHECK_EQ(num_items, num_labels);
+  image_file.read(reinterpret_cast<char*>(&rows), 4);
+  rows = swap_endian(rows);
+  image_file.read(reinterpret_cast<char*>(&cols), 4);
+  cols = swap_endian(cols);
+
+  // Open leveldb
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.create_if_missing = true;
+  options.error_if_exists = true;
+  leveldb::Status status = leveldb::DB::Open(
+      options, db_filename, &db);
+  CHECK(status.ok()) << "Failed to open leveldb " << db_filename
+      << ". Is it already existing?";
+
+  char* label_temp = new char[4];  // label for unsigned char*
+  signed char* label_i = new signed char[4];  // label for triplet
+  signed char* label_j = new signed char[4];
+  signed char* label_k = new signed char[4];
+  signed char* label_l = new signed char[4];  // label for pair wise
+  signed char* label_m = new signed char[4];
+  int db_size;
+  if (rgb_use1 == 0)
+    db_size = rows * cols;
+  else
+    db_size = 3 * rows * cols;
+  char* pixels1 = new char[db_size];
+  char* pixels2 = new char[db_size];
+  char* pixels3 = new char[db_size];
+  char* pixels4 = new char[db_size];
+  char* pixels5 = new char[db_size];
+  const int kMaxKeyLength = 10;
+  char key[kMaxKeyLength];
+  std::string value;
+  caffe::Datum datum;
+  if (rgb_use1 == 0)
+    datum.set_channels(1);
+  else
+    datum.set_channels(3);
+  datum.set_height(rows);
+  datum.set_width(cols);
+  LOG(INFO) << "A total of " << num_items << " items.";
+  LOG(INFO) << "Rows: " << rows << " Cols: " << cols;
+  int counter = 0;
+  // This codes selecting 1 positive sample and 3 negative samples for a triplet
+  // set. We randomly select data and decide whether concatenating data set to
+  // DB file according to labels.
+  for (unsigned int times = 0; times < 10; ++times) {
+    // iteration in the samples of all class
+    for (unsigned int itemid = 0; itemid < num_items/class_num; ++itemid) {
+      // iteration in the samples in one class
+      for (unsigned int class_ind = 0; class_ind < class_num; ++class_ind) {
+      // use reference sample one by one at each iteration
+      int i = itemid % num_items + class_ind*num_items/class_num;
+      int j = caffe::caffe_rng_rand() % num_items;  // pick triplet groups
+      int k = caffe::caffe_rng_rand() % num_items;
+      int l = caffe::caffe_rng_rand() % num_items;  // pick pair wise groups
+      int m = caffe::caffe_rng_rand() % num_items;
+      read_image(&image_file, &label_file, i, rows, cols,  // read triplet
+        pixels1, label_temp, label_i, rgb_use1);
+      read_image(&image_file, &label_file, j, rows, cols,
+        pixels2, label_temp, label_j, rgb_use1);
+      read_image(&image_file, &label_file, k, rows, cols,
+        pixels3, label_temp, label_k, rgb_use1);
+      read_image(&image_file, &label_file, l, rows, cols,  // read pair wise
+        pixels4, label_temp, label_l, rgb_use1);
+      read_image(&image_file, &label_file, m, rows, cols,
+        pixels5, label_temp, label_m, rgb_use1);
+
+      bool pair_pass = false;
+      bool triplet1_pass = false;
+      bool triplet2_pass = false;
+      bool triplet3_class_same = false;
+      bool triplet3_pass = false;
+
+      int ij_diff_x = static_cast<int>(*(label_i+1)-*(label_j+1));
+      int ij_diff_y = static_cast<int>(*(label_i+2)-*(label_j+2));
+      int ij_diff_z = static_cast<int>(*(label_i+3)-*(label_j+3));
+      int im_diff_x = static_cast<int>(*(label_i+1)-*(label_m+1));
+      int im_diff_y = static_cast<int>(*(label_i+2)-*(label_m+2));
+      int im_diff_z = static_cast<int>(*(label_i+3)-*(label_m+3));
+
+      int ij_x = ij_diff_x*ij_diff_x;
+      int ij_y = ij_diff_y*ij_diff_y;
+      int ij_z = ij_diff_z*ij_diff_z;
+      int im_x = im_diff_x*im_diff_x;
+      int im_y = im_diff_y*im_diff_y;
+      int im_z = im_diff_z*im_diff_z;
+
+      float dist_ij = std::sqrt(ij_x + ij_y + ij_z);
+      float dist_im = std::sqrt(im_x + im_y + im_z);
+      // Arrange training data according to conditionals including category
+      // and pose of synthetic data, dist_* could be ignored if you
+      // only concentrate on category.
+      if (*label_i == *label_j && dist_ij < 100/3 && dist_ij != 0)
+        pair_pass = true;
+      if (pair_pass && (*label_i  != *label_k))
+        triplet1_pass = true;
+      if (pair_pass && (*label_i  != *label_l))
+        triplet2_pass = true;
+      if (pair_pass && (*label_i  == *label_m))
+        triplet3_class_same = true;
+      if (triplet3_class_same && dist_im > 100/3)
+        triplet3_pass = true;
+      if (pair_pass && triplet1_pass && triplet2_pass && triplet3_pass) {
+        datum.set_data(pixels1, db_size);  // set data
+        datum.set_label(static_cast<int>(*label_i));
+        datum.SerializeToString(&value);
+        snprintf(key, kMaxKeyLength, "%08d", counter);
+        db->Put(leveldb::WriteOptions(), std::string(key), value);
+        counter++;
+        datum.set_data(pixels2, db_size);  // set data
+        datum.set_label(static_cast<int>(*label_j));
+        datum.SerializeToString(&value);
+        snprintf(key, kMaxKeyLength, "%08d", counter);
+        db->Put(leveldb::WriteOptions(), std::string(key), value);
+        counter++;
+        datum.set_data(pixels3, db_size);  // set data
+        datum.set_label(static_cast<int>(*label_k));
+        datum.SerializeToString(&value);
+        snprintf(key, kMaxKeyLength, "%08d", counter);
+        db->Put(leveldb::WriteOptions(), std::string(key), value);
+        counter++;
+        datum.set_data(pixels4, db_size);  // set data
+        datum.set_label(static_cast<int>(*label_l));
+        datum.SerializeToString(&value);
+        snprintf(key, kMaxKeyLength, "%08d", counter);
+        db->Put(leveldb::WriteOptions(), std::string(key), value);
+        counter++;
+        datum.set_data(pixels5, db_size);  // set data
+        datum.set_label(static_cast<int>(*label_m));
+        datum.SerializeToString(&value);
+        snprintf(key, kMaxKeyLength, "%08d", counter);
+        db->Put(leveldb::WriteOptions(), std::string(key), value);
+        counter++;
+      } else {
+        class_ind--;
+      }
+      }  // iteration in the samples of all class
+    }  // iteration in the samples in one class
+  }  // iteration in times
+  delete db;
+  delete pixels1;
+  delete pixels2;
+  delete pixels3;
+  delete pixels4;
+  delete pixels5;
+}
+
+int main(int argc, char** argv) {
+  if (argc != 6) {
+    printf("This script converts the dataset to the leveldb format used\n"
+           "by caffe to train a triplet network.\n"
+           "Usage:\n"
+           "    convert_3d_data input_image_file input_label_file "
+           "output_db_file class_number rgb_use \n");
+  } else {
+    google::InitGoogleLogging(argv[0]);
+    convert_dataset(argv[1], argv[2], argv[3], argv[4], argv[5]);
+  }
+  return 0;
+}
+#else
+int main(int argc, char** argv) {
+    LOG(FATAL) << "This example requires LevelDB; compile with USE_LEVELDB.";
+}
+#endif  // USE_LEVELDB
diff --git a/examples/triplet/create_3d_triplet.sh b/examples/triplet/create_3d_triplet.sh
new file mode 100644
index 00000000000..376a38dc49f
--- /dev/null
+++ b/examples/triplet/create_3d_triplet.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env sh
+# This script converts the mnist data into leveldb format.
+
+EXAMPLES=./build/examples/triplet
+DATA=./data/linemod
+
+echo "Creating leveldb..."
+
+# this script taking data which consist of 6 categories to leveldb format for
+# tripplet training.
+
+rm -rf ./examples/triplet/3d_triplet_train_leveldb
+rm -rf ./examples/triplet/3d_triplet_test_leveldb
+
+$EXAMPLES/convert_3d_triplet_data.bin \
+    $DATA/binary_image_train \
+    $DATA/binary_label_train \
+    ./examples/triplet/3d_triplet_train_leveldb \
+    6 \
+    0
+$EXAMPLES/convert_3d_triplet_data.bin \
+    $DATA/binary_image_test \
+    $DATA/binary_label_test \
+    ./examples/triplet/3d_triplet_test_leveldb \
+    6 \
+    0
+echo "Done."
diff --git a/examples/triplet/multipie_triplet.prototxt  b/examples/triplet/multipie_triplet.prototxt 
new file mode 100644
index 00000000000..267ddc3475e
--- /dev/null
+++ b/examples/triplet/multipie_triplet.prototxt 	
@@ -0,0 +1,110 @@
+name: "multipie_triplet"
+input: "data"
+input_dim: 1
+input_dim: 1
+input_dim: 75
+input_dim: 65
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 16
+    kernel_size: 8
+    stride: 1
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "pool1"
+  top: "pool1"
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 7
+    kernel_size: 5
+    stride: 1
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "conv2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "pool2"
+  top: "pool2"
+}
+layer {
+  name: "ip1"
+  type: "InnerProduct"
+  bottom: "pool2"
+  top: "ip1"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  inner_product_param {
+    num_output: 256
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "ip1"
+  top: "ip1"
+}
+layer {
+  name: "feat"
+  type: "InnerProduct"
+  bottom: "ip1"
+  top: "feat"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  inner_product_param {
+    num_output: 150
+  }
+}
diff --git a/examples/triplet/multipie_triplet_solver.prototxt b/examples/triplet/multipie_triplet_solver.prototxt
new file mode 100644
index 00000000000..22ab32d6fe2
--- /dev/null
+++ b/examples/triplet/multipie_triplet_solver.prototxt
@@ -0,0 +1,25 @@
+# The train/test net protocol buffer definition
+net: "examples/triplet/multipie_triplet_train_test.prototxt"
+# test_iter specifies how many forward passes the test should carry out.
+# In the case of face database, we have test batch size 250 and 250 test iterations: 50*(2+3)=250,
+#
+test_iter: 500
+# Carry out testing every 500 training iterations.
+test_interval: 500
+# The base learning rate, momentum and the weight decay of the network.
+base_lr: 0.001
+momentum: 0.9
+weight_decay: 0.0000
+# The learning rate policy
+lr_policy: "inv"
+gamma: 0.001
+power: 0.75
+# Display every 500 iterations
+display: 500
+# The maximum number of iterations
+max_iter: 500000
+# snapshot intermediate results
+snapshot: 10000
+snapshot_prefix: "examples/triplet/multipie_triplet"
+# solver mode: CPU or GPU
+solver_mode: CPU
diff --git a/examples/triplet/multipie_triplet_train_test.prototxt b/examples/triplet/multipie_triplet_train_test.prototxt
new file mode 100644
index 00000000000..7f457ebc91d
--- /dev/null
+++ b/examples/triplet/multipie_triplet_train_test.prototxt
@@ -0,0 +1,191 @@
+name: "multipie_triplet_train_test"
+layer {
+  name: "data"
+  type: "ImageData"
+  top: "data"
+  top: "label"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    scale: 0.00390625
+  }
+  image_data_param {
+    source: "/Users/yidawang/Documents/MATLAB/TripletTest/Face/testlist.txt"
+    batch_size: 250
+  }
+}
+layer {
+  name: "data"
+  type: "ImageData"
+  top: "data"
+  top: "label"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    scale: 0.00390625
+  }
+  image_data_param {
+    source: "/Users/yidawang/Documents/MATLAB/TripletTest/Face/testlist.txt"
+    batch_size: 250
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    name: "conv1_w"
+    lr_mult: 1
+  }
+  param {
+    name: "conv1_b"
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 16
+    kernel_size: 8
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "pool1"
+  top: "pool1"
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    name: "conv2_w"
+    lr_mult: 1
+  }
+  param {
+    name: "conv2_b"
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 7
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "conv2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "pool2"
+  top: "pool2"
+}
+layer {
+  name: "ip1"
+  type: "InnerProduct"
+  bottom: "pool2"
+  top: "ip1"
+  param {
+    name: "ip1_w"
+    lr_mult: 1
+  }
+  param {
+    name: "ip1_b"
+    lr_mult: 2
+  }
+  inner_product_param {
+    num_output: 256
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "ip1"
+  top: "ip1"
+}
+layer {
+  name: "feat"
+  type: "InnerProduct"
+  bottom: "ip1"
+  top: "feat"
+  param {
+    name: "feat_w"
+    lr_mult: 1
+  }
+  param {
+    name: "feat_b"
+    lr_mult: 2
+  }
+  inner_product_param {
+    num_output: 150
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "accuracy"
+  type: "Accuracy"
+  bottom: "feat"
+  bottom: "label"
+  top: "accuracy"
+  include {
+    phase: TEST
+  }
+}
+layer {
+  name: "loss"
+  type: "TripletLoss"
+  bottom: "feat"
+  bottom: "label"
+  top: "loss"
+  triplet_loss_param {
+    margin: 1
+    losstype: 0
+    num_negatives: 3
+  }
+}
diff --git a/examples/triplet/objtrans.py b/examples/triplet/objtrans.py
new file mode 100644
index 00000000000..5de5268b725
--- /dev/null
+++ b/examples/triplet/objtrans.py
@@ -0,0 +1,18 @@
+import bpy
+for i in range(1, 51):
+	override = {'selected_bases': list(bpy.context.scene.object_bases)}
+	bpy.ops.object.delete(override)
+	dir_from='/Users/yidawang/Downloads/temp/motorbike/'+str(i)+'/'+str(i)+'.obj'
+	dir_to='/Users/yidawang/Downloads/collection/motorbike/'+str(i)+'/'+str(i)+'.obj'
+	glob_type=str(i)+'.obj;'+str(i)+'.mtl'
+	scene = bpy.context.scene
+	lamp_data = bpy.data.lamps.new(name="New Lamp", type='HEMI')
+	lamp_object = bpy.data.objects.new(name="New Lamp", object_data=lamp_data)
+	scene.objects.link(lamp_object)
+	lamp_object.location = (0, 0, 1)
+	lamp_object.select = True
+	scene.objects.active = lamp_object
+	bpy.ops.import_scene.obj(filepath=dir_from,filter_glob=glob_type)
+	bpy.ops.object.shade_flat()
+	bpy.ops.export_scene.obj(filepath=dir_to,filter_glob=glob_type)
+
diff --git a/examples/triplet/pascal3d_triplet.prototxt b/examples/triplet/pascal3d_triplet.prototxt
new file mode 100644
index 00000000000..7163791faaf
--- /dev/null
+++ b/examples/triplet/pascal3d_triplet.prototxt
@@ -0,0 +1,295 @@
+name: "pascal_triplet"
+input: "data"
+input_dim: 1
+input_dim: 3
+input_dim: 227
+input_dim: 227
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    name: "conv1_w"
+    lr_mult: 1
+  }
+  param {
+    name: "conv1_b"
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 16
+    kernel_size: 7
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "bn1"
+  type: "BatchNorm"
+  bottom: "conv1"
+  top: "bn1"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "bn1"
+  top: "bn1"
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "bn1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    name: "conv2_w"
+    lr_mult: 1
+  }
+  param {
+    name: "conv2_b"
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 30
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "bn2"
+  type: "BatchNorm"
+  bottom: "conv2"
+  top: "bn2"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "bn2"
+  top: "bn2"
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "bn2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    name: "conv3_w"
+    lr_mult: 1
+  }
+  param {
+    name: "conv3_b"
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 28
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "bn3"
+  type: "BatchNorm"
+  bottom: "conv3"
+  top: "bn3"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "bn3"
+  top: "bn3"
+}
+layer {
+  name: "pool3"
+  type: "Pooling"
+  bottom: "bn3"
+  top: "pool3"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "pool3"
+  top: "conv4"
+  param {
+    name: "conv4_w"
+    lr_mult: 1
+  }
+  param {
+    name: "conv4_b"
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 14
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "bn4"
+  type: "BatchNorm"
+  bottom: "conv4"
+  top: "bn4"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "bn4"
+  top: "bn4"
+}
+layer {
+  name: "pool4"
+  type: "Pooling"
+  bottom: "bn4"
+  top: "pool4"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "ip1"
+  type: "InnerProduct"
+  bottom: "pool4"
+  top: "ip1"
+  param {
+    name: "ip1_w"
+    lr_mult: 1
+  }
+  param {
+    name: "ip1_b"
+    lr_mult: 2
+  }
+  inner_product_param {
+    num_output: 512
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "ip1"
+  top: "ip1"
+}
+layer {
+  name: "drop1"
+  type: "Dropout"
+  bottom: "ip1"
+  top: "ip1"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+layer {
+  name: "feat"
+  type: "InnerProduct"
+  bottom: "ip1"
+  top: "feat"
+  param {
+    name: "feat_w"
+    lr_mult: 1
+  }
+  param {
+    name: "feat_b"
+    lr_mult: 2
+  }
+  inner_product_param {
+    num_output: 100
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
diff --git a/examples/triplet/pascal3d_triplet_solver.prototxt b/examples/triplet/pascal3d_triplet_solver.prototxt
new file mode 100644
index 00000000000..7e0789cf23b
--- /dev/null
+++ b/examples/triplet/pascal3d_triplet_solver.prototxt
@@ -0,0 +1,25 @@
+# The train/test net protocol buffer definition
+net: "examples/triplet/pascal3d_triplet_train_test.prototxt"
+# test_iter specifies how many forward passes the test should carry out.
+# In the case of face database, we have test batch size 250 and 250 test iterations: 50*(2+3)=250,
+#
+test_iter: 500
+# Carry out testing every 500 training iterations.
+test_interval: 500
+# The base learning rate, momentum and the weight decay of the network.
+base_lr: 0.001
+momentum: 0.9
+weight_decay: 0.0000
+# The learning rate policy
+lr_policy: "inv"
+gamma: 0.001
+power: 0.75
+# Display every 500 iterations
+display: 100
+# The maximum number of iterations
+max_iter: 50000
+# snapshot intermediate results
+snapshot: 500
+snapshot_prefix: "examples/triplet/pascal_triplet"
+# solver mode: CPU or GPU
+solver_mode: CPU
diff --git a/examples/triplet/pascal3d_triplet_train_test.prototxt b/examples/triplet/pascal3d_triplet_train_test.prototxt
new file mode 100644
index 00000000000..26010099b23
--- /dev/null
+++ b/examples/triplet/pascal3d_triplet_train_test.prototxt
@@ -0,0 +1,335 @@
+name: "pascal_triplet_train_test"
+layer {
+  name: "data"
+  type: "ImageData"
+  top: "data"
+  top: "label"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    scale: 0.00390625
+  }
+  image_data_param {
+    source: "/Users/yidawang/Documents/MATLAB/TripletTest/Face/testlist_regular.txt"
+    batch_size: 250
+  }
+}
+layer {
+  name: "data"
+  type: "ImageData"
+  top: "data"
+  top: "label"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    scale: 0.00390625
+  }
+  image_data_param {
+    source: "/Users/yidawang/Documents/MATLAB/TripletTest/Face/testlist_regular.txt"
+    batch_size: 25
+  }
+}
+
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    name: "conv1_w"
+    lr_mult: 1
+  }
+  param {
+    name: "conv1_b"
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 16
+    kernel_size: 7
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "bn1"
+  type: "BatchNorm"
+  bottom: "conv1"
+  top: "bn1"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "bn1"
+  top: "bn1"
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "bn1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    name: "conv2_w"
+    lr_mult: 1
+  }
+  param {
+    name: "conv2_b"
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 30
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "bn2"
+  type: "BatchNorm"
+  bottom: "conv2"
+  top: "bn2"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "bn2"
+  top: "bn2"
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "bn2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    name: "conv3_w"
+    lr_mult: 1
+  }
+  param {
+    name: "conv3_b"
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 28
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "bn3"
+  type: "BatchNorm"
+  bottom: "conv3"
+  top: "bn3"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "bn3"
+  top: "bn3"
+}
+layer {
+  name: "pool3"
+  type: "Pooling"
+  bottom: "bn3"
+  top: "pool3"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "pool3"
+  top: "conv4"
+  param {
+    name: "conv4_w"
+    lr_mult: 1
+  }
+  param {
+    name: "conv4_b"
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 14
+    kernel_size: 3
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "bn4"
+  type: "BatchNorm"
+  bottom: "conv4"
+  top: "bn4"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "bn4"
+  top: "bn4"
+}
+layer {
+  name: "pool4"
+  type: "Pooling"
+  bottom: "bn4"
+  top: "pool4"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "ip1"
+  type: "InnerProduct"
+  bottom: "pool4"
+  top: "ip1"
+  param {
+    name: "ip1_w"
+    lr_mult: 1
+  }
+  param {
+    name: "ip1_b"
+    lr_mult: 2
+  }
+  inner_product_param {
+    num_output: 512
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "ip1"
+  top: "ip1"
+}
+layer {
+  name: "drop1"
+  type: "Dropout"
+  bottom: "ip1"
+  top: "ip1"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+layer {
+  name: "feat"
+  type: "InnerProduct"
+  bottom: "ip1"
+  top: "feat"
+  param {
+    name: "feat_w"
+    lr_mult: 1
+  }
+  param {
+    name: "feat_b"
+    lr_mult: 2
+  }
+  inner_product_param {
+    num_output: 100
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "loss"
+  type: "TripletLoss"
+  bottom: "feat"
+  bottom: "label"
+  top: "loss"
+  triplet_loss_param {
+    margin: 0.2
+    losstype: 2
+    num_negatives: 3
+  }
+}
\ No newline at end of file
diff --git a/examples/triplet/readme.md b/examples/triplet/readme.md
new file mode 100644
index 00000000000..8ed0bcf7819
--- /dev/null
+++ b/examples/triplet/readme.md
@@ -0,0 +1,97 @@
+---
+title: Triplet and Pair Wise Network Tutorial
+description: Train and test a triplet network on data generated by 3D model from PASCAL3D+ or faces in MultiPIE.
+category: example
+include_in_docs: true
+layout: default
+priority: 100
+---
+
+# Triplet Network Training with Caffe
+This example shows how you can use weight sharing and a triplet loss
+function to learn a model using a triplet network in Caffe.
+
+We will assume that you have caffe successfully compiled. If not, please refer
+to the [Installation page](../../installation.html).
+
+## Training Data Preparation
+
+For reason that the triplet loss is widely used in multitask training where training samples
+are labeled regarding to different aspects, labels of samples are not necessary because
+only sequence of training samples matters for triplet loss(such as r1, p1, n1,...,rN ,pN, nN).
+1 reference samples and 1 positive sample is fixed in a triplet set, and the
+number of negative samples could be set free. So you can use a file list for training
+and testing samples. The triplet loss layer could be difined as below:
+
+layer {
+  name: "loss"
+  type: "TripletLoss"
+  bottom: "feat"
+  bottom: "label"
+  top: "loss"
+  triplet_loss_param {
+    margin: 0.2
+    losstype: 2
+    num_negatives: 3
+  }
+}
+
+where the num_negatives is the number of negative sampels in 1 triplet set,
+this set uses 5 samples in total, so the over all training samples must be multiples of 5.
+
+## Introduction to the convert_triplet_data tool
+
+If training based on DB files is needed, here I attach a tool used for converting
+binary files including data and label to levelDB database. Triplet loss is based on
+particular sequence of training data, so the labels of each sample is used for the
+arrangement of training data. This codes convert a set of binary synthetic data
+and label(catogory and pose) to levelDB files and arrange them as triplet set consist
+of 1 positive sample and 3 negative samples.
+
+You should modify label reading method according to binaryfile in `read_image` function
+and conditionals in `convert_dataset` function.
+
+## Models
+First, we will define the model that we want to train using the triplet network.
+We will use the convolutional net defined in
+`./examples/triplet/pascal3d_triplet.prototxt`.
+
+## Define the triplet Network
+
+In this section we will define the triplet network used for training. The
+resulting network is defined in
+`./examples/triplet/pascal3d_triplet_train_test.prototxt`.
+
+### Adding the Triplet Loss Function
+
+To train the network we will optimize a triplet loss function proposed in:
+This cost function is implemented with the `TRIPLET_LOSS` layer,
+the num_negatives could be set free:
+
+
+layer {
+  name: "loss"
+  type: "TripletLoss"
+  bottom: "feat"
+  bottom: "sim"
+  top: "loss"
+  triplet_loss_param {
+    margin: 1
+    losstype: 0
+    num_negatives: 3
+  }
+}
+
+## Define the Solver
+
+Nothing special needs to be done to the solver besides pointing it at the
+correct model file. The solver is defined in
+`./examples/triplet/pascal3d_triplet_solver.prototxt`.
+
+## Training and Testing the Model
+
+Training the model is simple after you have written the network definition
+protobuf and solver protobuf files. Simply run
+`./examples/triplet/train_pascal3d_triplet.sh`:
+
+    ./examples/triplet/train_pascal3d_triplet.sh
diff --git a/examples/triplet/train_multipie_triplet.sh b/examples/triplet/train_multipie_triplet.sh
new file mode 100644
index 00000000000..6c73c2989ff
--- /dev/null
+++ b/examples/triplet/train_multipie_triplet.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env sh
+# This script training in MULTIPIE database which takes 1 positive sample and 3
+# negative samples as training data set, the negative samples are ones which are
+# different from reference sample.
+
+TOOLS=./build/tools
+
+$TOOLS/caffe train --solver=examples/triplet/multipie_triplet_solver.prototxt
\ No newline at end of file
diff --git a/examples/triplet/train_pascal_triplet.sh b/examples/triplet/train_pascal_triplet.sh
new file mode 100644
index 00000000000..c5924dab79a
--- /dev/null
+++ b/examples/triplet/train_pascal_triplet.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env sh
+
+TOOLS=./build/tools
+
+$TOOLS/caffe train --solver=examples/triplet/pascal3d_triplet_solver.prototxt
\ No newline at end of file
diff --git a/include/caffe/layers/triplet_loss_layer.hpp b/include/caffe/layers/triplet_loss_layer.hpp
new file mode 100644
index 00000000000..9690a8a1154
--- /dev/null
+++ b/include/caffe/layers/triplet_loss_layer.hpp
@@ -0,0 +1,78 @@
+#ifndef CAFFE_TRIPLET_LOSS_LAYER_HPP_
+#define CAFFE_TRIPLET_LOSS_LAYER_HPP_
+#include <vector>
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/layers/loss_layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+namespace caffe {
+
+/**
+ * @brief Compute triplet loss with multiple negative samples.
+ */
+template <typename Dtype>
+  class TripletLossLayer : public LossLayer<Dtype> {
+   public:
+    explicit TripletLossLayer(const LayerParameter& param)
+    : LossLayer<Dtype>(param), diff_() {}
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+                            const vector<Blob<Dtype>*>& top);
+    virtual inline int ExactNumBottomBlobs() const { return 2; }
+    virtual inline const char* type() const { return "TripletLoss"; }
+    virtual inline bool AllowForceBackward(const int bottom_index) const {
+      return bottom_index != 1;
+    }
+
+   protected:
+    /// @copydoc TripletLossLayer
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+                             const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+                             const vector<Blob<Dtype>*>& top);
+    /**
+     * There are 3 types of triplet loss which are:
+     * L_0(x_a, x_p, x_n) = max(0, m - || x_a - x_n ||_2^2 + || x_a - x_p ||_2^2)
+     * used in FaceNet: A Unified Embedding for Face Recognition and Clustering
+     * L_1(x_a, x_p, x_n) = max(0, 1 - || x_a - x_n ||_2^2 / (|| x_a - x_p ||_2^2 + m))
+     * used in Learning Descriptors for Object Recognition and 3D Pose Estimation
+     * L_2(x_a, x_p, x_n) = max(0, 1 - exp(|| x_a - x_n ||_2^2) / (exp(|| x_a - x_p ||_2^2) + m))
+     * used in Learning Descriptors for Object Recognition and 3D Pose Estimation
+     */
+
+    /** One of the partial deriviation is:
+     * \begin{eqnarray}
+     * \mathcal{L}_{tri}(s_i,s_j,s_k) = max(0,1-\frac{||f(x_i)-f(x_k)||_2^2}{||f(x_i)-f(x_j)||_2^2+m})
+     * \end{eqnarray}®
+     * where $ f(x) $ is the input of the loss layer for sample $ x $ and m is the margin for triplet.
+     * Denote that $D_{ij}=||f(x_i)-f(x_j)||_2^2$ and $D_{ik}=||f(x_i)-f(x_k)||_2^2$,
+     * so the partial differential equations for the input of triplet loss layer are:
+     * \begin{eqnarray}
+     * \dfrac{\partial \mathcal{L}_{tri}}{\partial f(x_i)}=
+     * &\frac{D_{ik}(f(x_i)-f(x_j))-(D_{ij}+m)(f(x_i)-f(x_k))}{(D_{ij}+m)^2} \nonumber \\
+     * \dfrac{\partial \mathcal{L}_{tri}}{\partial f(x_j)}=
+     * &\frac{D_{ik}(f(x_j)-f(x_i))}{(D_{ij}+m)^2} \nonumber \\
+     * \dfrac{\partial \mathcal{L}_{tri}}{\partial f(x_k)}=
+     * &\frac{f(x_i)-f(x_k)}{D_{ij}+m}
+     * \end{eqnarray}®
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+                              const vector<bool>& propagate_down,
+                              const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+                              const vector<bool>& propagate_down,
+                              const vector<Blob<Dtype>*>& bottom);
+    Blob<Dtype> diff_;  // cached for backward pass
+    Blob<Dtype> diff_pos;
+    Blob<Dtype> diff_neg;
+    Blob<Dtype> dist_sq_;  // cached for backward pass
+    Blob<Dtype> dist_sq_pos;
+    Blob<Dtype> dist_sq_neg;
+    Blob<Dtype> diff_sq_;  // tmp storage for gpu forward pass
+    Blob<Dtype> diff_sq_pos;
+    Blob<Dtype> diff_sq_neg;
+    Blob<Dtype> summer_vec_;  // tmp storage for gpu forward pass
+  };
+
+}  // namespace caffe
+
+#endif  // CAFFE_TRIPLET_LOSS_LAYER_HPP_
diff --git a/src/caffe/layers/triplet_loss_layer.cpp b/src/caffe/layers/triplet_loss_layer.cpp
new file mode 100644
index 00000000000..8fc6fc6bd5e
--- /dev/null
+++ b/src/caffe/layers/triplet_loss_layer.cpp
@@ -0,0 +1,684 @@
+#include <algorithm>
+#include <vector>
+#include "caffe/layer.hpp"
+#include "caffe/layers/triplet_loss_layer.hpp"
+#include "caffe/util/io.hpp"
+#include "caffe/util/math_functions.hpp"
+namespace caffe {
+template <typename Dtype>
+void TripletLossLayer<Dtype>::LayerSetUp(
+  const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  LossLayer<Dtype>::LayerSetUp(bottom, top);
+  // number of triplet in a batch
+  int num_negatives = this->layer_param_.triplet_loss_param().num_negatives();
+  // dimension of each descriptor
+  int dim = bottom[0]->count()/bottom[0]->num();
+  CHECK_EQ(bottom[0]->channels(), dim);
+  CHECK_EQ(bottom[0]->height(), 1);
+  CHECK_EQ(bottom[0]->width(), 1);
+  CHECK_EQ(bottom[1]->channels(), 1);
+  CHECK_EQ(bottom[1]->height(), 1);
+  CHECK_EQ(bottom[1]->width(), 1);
+  // In each set, we have:
+  // the descriptor of reference sample, closest sample, and negative samples
+  // number of sets in the whole batch
+  int num_set = bottom[0]->num()/(2 + num_negatives);
+  dist_sq_.Reshape(num_set, 1, 1, 1);
+  diff_pos.Reshape(num_set, dim, 1, 1);
+  dist_sq_pos.Reshape(num_set, 1, 1, 1);
+  diff_neg.Reshape(num_set, dim, 1, 1);
+  dist_sq_neg.Reshape(num_set, 1, 1, 1);
+  // vector of ones used to sum along channels
+  summer_vec_.Reshape(bottom[0]->channels(), 1, 1, 1);
+  for (int i = 0; i < bottom[0]->channels(); ++i)
+    summer_vec_.mutable_cpu_data()[i] = Dtype(1);
+}
+template <typename Dtype>
+void TripletLossLayer<Dtype>::Forward_cpu(
+  const vector<Blob<Dtype>*>& bottom,
+  const vector<Blob<Dtype>*>& top) {
+  Dtype margin = this->layer_param_.triplet_loss_param().margin();
+  Dtype losstype = this->layer_param_.triplet_loss_param().losstype();
+  int num_negatives = this->layer_param_.triplet_loss_param().num_negatives();
+  int use_pair = this->layer_param_.triplet_loss_param().use_pair();
+  CHECK_EQ(bottom[0]->num()%(2 + num_negatives), 0);
+  Dtype loss(0.0);
+  int dim = bottom[0]->count()/bottom[0]->num();
+  int num_set = bottom[0]->num()/(2 + num_negatives);
+  if (losstype == 0) {
+    for (int i = 0; i < num_set; ++i) {
+      caffe_sub(
+                dim,
+                bottom[0]->cpu_data() +
+                (2 + num_negatives)*i*dim,  // reference
+                bottom[0]->cpu_data() +
+                ((2 + num_negatives)*i + 1)*dim,  // positive
+                diff_pos.mutable_cpu_data() + i*dim);  // reference-pose_close
+      // Loss component calculated from reference and close one
+      dist_sq_pos.mutable_cpu_data()[i] =
+      caffe_cpu_dot(dim,
+                    diff_pos.cpu_data() + i*dim,
+                    diff_pos.cpu_data() + i*dim);
+      // a b is a similar pair for pair wise
+      // loss accumulated by the pair wise part
+      if (use_pair == 1) {
+        loss += dist_sq_pos.cpu_data()[i];
+      }
+      for (int triplet = 0; triplet < num_negatives; ++triplet) {
+        // Triplet loss accumulation
+        // a and negative[triplet] is a similar pair for triplet
+        dist_sq_.mutable_cpu_data()[i] = dist_sq_pos.cpu_data()[i];
+        // Loss component calculated from negative part
+        caffe_sub(
+                  dim,
+                  bottom[0]->cpu_data() +
+                  (2 + num_negatives)*i*dim,  // reference
+                  bottom[0]->cpu_data() +
+                  ((2 + num_negatives)*i + 2 + triplet)*dim,
+                  diff_neg.mutable_cpu_data() + i*dim);  // reference-negative
+        dist_sq_neg.mutable_cpu_data()[i] =
+        caffe_cpu_dot(dim,
+                      diff_neg.cpu_data() + i*dim,
+                      diff_neg.cpu_data() + i*dim);
+        // a and negative[triplet] is a dissimilar pair for triplet
+        dist_sq_.mutable_cpu_data()[i] -= dist_sq_neg.cpu_data()[i];
+        // loss accumulated accumulated by the triplet part
+        loss += std::max(margin + dist_sq_.cpu_data()[i], Dtype(0.0));
+      }
+    }
+    loss = loss / static_cast<Dtype>(num_set) / Dtype(2);
+    top[0]->mutable_cpu_data()[0] = loss;
+  } else if (losstype == 1) {
+    for (int i = 0; i < num_set; ++i) {
+      caffe_sub(
+                dim,
+                bottom[0]->cpu_data() +
+                (2 + num_negatives)*i*dim,  // reference
+                bottom[0]->cpu_data() +
+                ((2 + num_negatives)*i + 1)*dim,  // positive
+                diff_pos.mutable_cpu_data() + i*dim);  // reference-pose_close
+      // Loss component calculated from reference and close one
+      dist_sq_pos.mutable_cpu_data()[i] =
+      caffe_cpu_dot(dim,
+                    diff_pos.cpu_data() + i*dim,
+                    diff_pos.cpu_data() + i*dim);
+      // a b is a similar pair for pair wise
+      // loss accumulated by the pair wise part
+      if (use_pair == 1) {
+        loss += dist_sq_pos.cpu_data()[i];
+      }
+      for (int triplet = 0; triplet < num_negatives; ++triplet) {
+        dist_sq_.mutable_cpu_data()[i] = dist_sq_pos.cpu_data()[i];
+        dist_sq_.mutable_cpu_data()[i] += margin;
+        // Loss component calculated from negative part
+        caffe_sub(
+                  dim,
+                  bottom[0]->cpu_data() +
+                  (2 + num_negatives)*i*dim,  // reference
+                  bottom[0]->cpu_data() +
+                  ((2 + num_negatives)*i + 2 + triplet)*dim,
+                  diff_neg.mutable_cpu_data() + i*dim);  // reference-negative
+        dist_sq_neg.mutable_cpu_data()[i] =
+        caffe_cpu_dot(dim,
+                      diff_neg.cpu_data() + i*dim,
+                      diff_neg.cpu_data() + i*dim);
+        // a and negative[triplet] is a dissimilar pair for triplet
+        dist_sq_.mutable_cpu_data()[i] = 1 - \
+        dist_sq_neg.cpu_data()[i] / dist_sq_.cpu_data()[i];
+        // loss accumulated accumulated by the triplet part
+        loss += std::max(dist_sq_.cpu_data()[i], Dtype(0.0));
+      }
+    }
+    loss = loss / static_cast<Dtype>(num_set) / Dtype(2);
+    top[0]->mutable_cpu_data()[0] = loss;
+  } else if (losstype == 2) {
+  for (int i = 0; i < num_set; ++i) {
+    caffe_sub(
+        dim,
+        bottom[0]->cpu_data() +
+        (2 + num_negatives)*i*dim,  // reference
+        bottom[0]->cpu_data() +
+        ((2 + num_negatives)*i + 1)*dim,  // positive
+        diff_pos.mutable_cpu_data() + i*dim);  // reference-pose_close
+    // Loss component calculated from reference and close one
+    dist_sq_pos.mutable_cpu_data()[i] =
+    caffe_cpu_dot(dim,
+          diff_pos.cpu_data() + i*dim,
+          diff_pos.cpu_data() + i*dim);
+    // a b is a similar pair for pair wise
+    // loss accumulated by the pair wise part
+    if (use_pair == 1) {
+    loss += dist_sq_pos.cpu_data()[i];
+    }
+    for (int triplet = 0; triplet < num_negatives; ++triplet) {
+    dist_sq_.mutable_cpu_data()[i] = exp(dist_sq_pos.cpu_data()[i]);
+    dist_sq_.mutable_cpu_data()[i] += margin;
+    // Loss component calculated from negative part
+    caffe_sub(
+          dim,
+          bottom[0]->cpu_data() +
+          (2 + num_negatives)*i*dim,  // reference
+          bottom[0]->cpu_data() +
+          ((2 + num_negatives)*i + 2 + triplet)*dim,
+          diff_neg.mutable_cpu_data() + i*dim);  // reference-negative
+    dist_sq_neg.mutable_cpu_data()[i] =
+    caffe_cpu_dot(dim,
+            diff_neg.cpu_data() + i*dim,
+            diff_neg.cpu_data() + i*dim);
+    // a and negative[triplet] is a dissimilar pair for triplet
+    dist_sq_.mutable_cpu_data()[i] = 1 - \
+    exp(dist_sq_neg.cpu_data()[i]) / dist_sq_.cpu_data()[i];
+    // loss accumulated accumulated by the triplet part
+    loss += std::max(dist_sq_.cpu_data()[i], Dtype(0.0));
+    }
+  }
+  loss = loss / static_cast<Dtype>(num_set) / Dtype(2);
+  top[0]->mutable_cpu_data()[0] = loss;
+  }
+}
+
+template <typename Dtype>
+void TripletLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+                                           const vector<bool>& propagate_down,
+                                           const vector<Blob<Dtype>*>& bottom) {
+  Dtype margin = this->layer_param_.triplet_loss_param().margin();
+  Dtype losstype = this->layer_param_.triplet_loss_param().losstype();
+  int num_negatives = this->layer_param_.triplet_loss_param().num_negatives();
+  int use_pair = this->layer_param_.triplet_loss_param().use_pair();
+  int dim = bottom[0]->count()/bottom[0]->num();
+  int num_set = bottom[0]->num()/(2 + num_negatives);
+  if (losstype == 0) {
+    // BP for feat1(extracted from reference)
+    for (int i = 0; i < 1; ++i) {
+        if (propagate_down[0]) {
+          const Dtype sign = 1;
+          const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+          static_cast<Dtype>(num_set);
+          for (int j = 0; j < num_set; ++j) {
+            Dtype* bout = bottom[0]->mutable_cpu_diff();
+            // the pair part
+            if (use_pair == 1) {
+              caffe_cpu_axpby(
+          dim,
+          alpha,
+          diff_pos.cpu_data() + (j*dim),
+          Dtype(0.0),
+          bout + ((2 + num_negatives)*j + i)*dim);
+            } else {
+              caffe_cpu_axpby(
+          dim,
+          Dtype(0.0),
+          diff_pos.cpu_data() + (j*dim),
+          Dtype(0.0),
+          bout + ((2 + num_negatives)*j + i)*dim);
+            }
+            // the num_negatives triplet part
+            for (int triplet = 0; triplet < num_negatives; ++triplet) {
+              caffe_sub(
+                        dim,
+                        bottom[0]->cpu_data() +
+                        (2 + num_negatives)*j*dim,  // reference
+                        bottom[0]->cpu_data() +
+                        ((2 + num_negatives)*j + 2 + triplet)*dim,
+                        diff_neg.mutable_cpu_data() +
+                        j*dim);  // reference-negative
+              // Triplet loss accumulation
+              // a and negative[triplet] is a similar pair for triplet
+              dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.cpu_data()[j];
+              dist_sq_neg.mutable_cpu_data()[j] =
+              caffe_cpu_dot(dim,
+                            diff_neg.cpu_data() + j*dim,
+                            diff_neg.cpu_data() + j*dim);
+              // a and negative[triplet] is a dissimilar pair for triplet
+              dist_sq_.mutable_cpu_data()[j] -= dist_sq_neg.cpu_data()[j];
+              // Loss component calculated from negative part
+              if ((margin + dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+                // similar pair in triplet
+                caffe_cpu_axpby(
+          dim,
+          alpha,
+          diff_pos.cpu_data() + (j*dim),
+          Dtype(1.0),
+          bout + (2 + num_negatives)*j*dim);
+                // dissimilar pair in triplet
+                caffe_cpu_axpby(
+          dim,
+          -alpha,
+          diff_neg.cpu_data() + (j*dim),
+          Dtype(1.0),
+          bout + ((2 + num_negatives)*j + i)*dim);
+            }
+          }
+        }
+      }
+    }
+    // BP for feat2(extracted from the closest sample)
+    for (int i = 1; i < 2; ++i) {
+      if (propagate_down[0]) {
+        const Dtype sign = -1;
+        const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+        static_cast<Dtype>(num_set);
+        for (int j = 0; j < num_set; ++j) {
+          Dtype* bout = bottom[0]->mutable_cpu_diff();
+          // the pair part
+          if (use_pair == 1) {
+            caffe_cpu_axpby(
+        dim,
+        alpha,
+        diff_pos.cpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+          } else {
+            caffe_cpu_axpby(
+        dim,
+        Dtype(0.0),
+        diff_pos.cpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+          }
+          // the num_negatives triplet part
+          for (int triplet = 0; triplet < num_negatives; ++triplet) {
+            caffe_sub(
+                      dim,
+                      bottom[0]->cpu_data() +
+                      (2 + num_negatives)*j*dim,  // reference
+                      bottom[0]->cpu_data() +
+                      ((2 + num_negatives)*j + 2 + triplet)*dim,
+                      diff_neg.mutable_cpu_data() +
+                      j*dim);  // reference-negative
+            // Triplet loss accumulation
+            // a and negative[triplet] is a similar pair for triplet
+            dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.cpu_data()[j];
+            dist_sq_neg.mutable_cpu_data()[j] =
+            caffe_cpu_dot(dim,
+                          diff_neg.cpu_data() + j*dim,
+                          diff_neg.cpu_data() + j*dim);
+            // a and negative[triplet] is a dissimilar pair for triplet
+            dist_sq_.mutable_cpu_data()[j] -= dist_sq_neg.cpu_data()[j];
+            if ((margin + dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+              // similar pair in triplet
+              caffe_cpu_axpby(
+          dim,
+          alpha,
+          diff_pos.cpu_data() + (j*dim),
+          Dtype(1.0),
+          bout + ((2 + num_negatives)*j + i)*dim);
+            }
+          }
+        }
+      }
+    }
+    // BP for negative feature used in the num_negatives triplet part
+    for (int i = 2; i < 2 + num_negatives; ++i) {
+      if (propagate_down[0]) {
+        const Dtype sign = 1;
+        const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+        static_cast<Dtype>(num_set);
+        for (int j = 0; j < num_set; ++j) {
+          Dtype* bout = bottom[0]->mutable_cpu_diff();
+          caffe_sub(
+                    dim,
+                    bottom[0]->cpu_data() +
+                    (2 + num_negatives)*j*dim,  // reference
+                    bottom[0]->cpu_data() +
+                    ((2 + num_negatives)*j + i)*dim,
+                    diff_neg.mutable_cpu_data() + j*dim);  // reference-negative
+          // Triplet loss accumulation
+          // a and negative[triplet] is a similar pair for triplet
+          dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.cpu_data()[j];
+          dist_sq_neg.mutable_cpu_data()[j] =
+          caffe_cpu_dot(dim,
+                        diff_neg.cpu_data() + j*dim,
+                        diff_neg.cpu_data() + j*dim);
+          // a and negative[triplet] is a dissimilar pair for triplet
+          dist_sq_.mutable_cpu_data()[j] -= dist_sq_neg.cpu_data()[j];
+          if ((margin + dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+            // dissimilar pairs
+            caffe_cpu_axpby(
+        dim,
+        alpha,
+        diff_neg.cpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+          } else {
+            caffe_set(dim, Dtype(0), bout + ((2 + num_negatives)*j + i)*dim);
+          }
+        }
+      }
+    }
+  } else if (losstype == 1) {
+    for (int i = 0; i < 1; ++i) {
+      // BP for data1(feat1)
+      if (propagate_down[0]) {
+        const Dtype sign = 1;
+        const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+        static_cast<Dtype>(num_set);
+        for (int j = 0; j < num_set; ++j) {
+          Dtype* bout = bottom[0]->mutable_cpu_diff();
+          // the pair part
+          if (use_pair == 1) {
+            caffe_cpu_axpby(
+        dim,
+        alpha,
+        diff_pos.cpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+          } else {
+            caffe_cpu_axpby(
+        dim,
+        Dtype(0.0),
+        diff_pos.cpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+          }
+          // the num_negatives triplet part
+          for (int triplet = 0; triplet < num_negatives; ++triplet) {
+            dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.mutable_cpu_data()[j];
+            dist_sq_.mutable_cpu_data()[j] += margin;
+            // Loss component calculated from negative part
+            caffe_sub(
+                      dim,
+                      bottom[0]->cpu_data() +
+                      (2 + num_negatives)*j*dim,  // reference
+                      bottom[0]->cpu_data() +
+                      ((2 + num_negatives)*j + 2 + triplet)*dim,
+                      diff_neg.mutable_cpu_data() +
+                      j*dim);  // reference-negative
+            dist_sq_neg.mutable_cpu_data()[j] =
+            caffe_cpu_dot(dim,
+                          diff_neg.cpu_data() + j*dim,
+                          diff_neg.cpu_data() + j*dim);
+            // a and negative[triplet] is a dissimilar pair for triplet
+            dist_sq_.mutable_cpu_data()[j] = 1 - \
+            dist_sq_neg.cpu_data()[j] / dist_sq_.cpu_data()[j];
+            // loss accumulated accumulated by the triplet part
+            if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+              caffe_cpu_axpby(
+          dim,
+          alpha*dist_sq_neg.cpu_data()[j]/
+          ((dist_sq_pos.cpu_data()[j]+margin)*
+           (dist_sq_pos.cpu_data()[j]+margin)),
+          diff_pos.cpu_data() + (j*dim),
+          Dtype(1.0),
+          bout + ((2 + num_negatives)*j + i)*dim);
+              caffe_cpu_axpby(
+          dim,
+          -alpha/(dist_sq_pos.mutable_cpu_data()[j]+margin),
+          diff_neg.cpu_data() + (j*dim),
+          Dtype(1.0),
+          bout + ((2 + num_negatives)*j + i)*dim);
+            }
+          }
+        }
+      }
+    }
+    for (int i = 1; i < 2; ++i) {
+      // BP for positive data(feat2)
+      if (propagate_down[0]) {
+        const Dtype sign = -1;
+        const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+        static_cast<Dtype>(num_set);
+        for (int j = 0; j < num_set; ++j) {
+          Dtype* bout = bottom[0]->mutable_cpu_diff();
+          // the pair part
+          if (use_pair == 1) {
+            caffe_cpu_axpby(
+        dim,
+        alpha,
+        diff_pos.cpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+          } else {
+            caffe_cpu_axpby(
+        dim,
+        Dtype(0.0),
+        diff_pos.cpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+          }
+          // the num_negatives triplet part
+          for (int triplet = 0; triplet < num_negatives; ++triplet) {
+            dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.cpu_data()[j];
+            dist_sq_.mutable_cpu_data()[j] += margin;
+            // Loss component calculated from negative part
+            caffe_sub(
+                      dim,
+                      bottom[0]->cpu_data() +
+                      (2 + num_negatives)*j*dim,  // reference
+                      bottom[0]->cpu_data() +
+                      ((2 + num_negatives)*j + 2 + triplet)*dim,
+                      diff_neg.mutable_cpu_data() +
+                      j*dim);  // reference-negative
+            dist_sq_neg.mutable_cpu_data()[j] =
+            caffe_cpu_dot(dim,
+                          diff_neg.cpu_data() + j*dim,
+                          diff_neg.cpu_data() + j*dim);
+            // a and negative[triplet] is a dissimilar pair for triplet
+            dist_sq_.mutable_cpu_data()[j] = 1 - \
+            dist_sq_neg.cpu_data()[j] / dist_sq_.cpu_data()[j];
+            // loss accumulated accumulated by the triplet part
+            if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+              caffe_cpu_axpby(
+          dim,
+          alpha*dist_sq_neg.cpu_data()[j]/
+            ((dist_sq_pos.cpu_data()[j]+margin)*
+           (dist_sq_pos.cpu_data()[j]+margin)),
+          diff_pos.cpu_data() + (j*dim),
+          Dtype(1.0),
+          bout + ((2 + num_negatives)*j + i)*dim);
+            }
+          }
+        }
+      }
+    }
+    for (int i = 2; i < 2 + num_negatives; ++i) {
+      // BP for negative data(feat3)
+      if (propagate_down[0]) {
+        const Dtype sign = 1;
+        const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+        static_cast<Dtype>(num_set);
+        for (int j = 0; j < num_set; ++j) {
+          Dtype* bout = bottom[0]->mutable_cpu_diff();
+          dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.cpu_data()[j];
+          dist_sq_.mutable_cpu_data()[j] += margin;
+          // Loss component calculated from negative part
+          caffe_sub(
+                    dim,
+                    bottom[0]->cpu_data() + (2 + num_negatives)*j*dim,  // ref
+                    bottom[0]->cpu_data() + ((2 + num_negatives)*j + i)*dim,
+                    diff_neg.mutable_cpu_data() + j*dim);  // ref-negative
+          dist_sq_neg.mutable_cpu_data()[j] =
+          caffe_cpu_dot(dim,
+                        diff_neg.cpu_data() + j*dim,
+                        diff_neg.cpu_data() + j*dim);
+          // a and negative[triplet] is a dissimilar pair for triplet
+          dist_sq_.mutable_cpu_data()[j] = 1 - \
+          dist_sq_neg.cpu_data()[j] / dist_sq_.cpu_data()[j];
+          // loss accumulated accumulated by the triplet part
+          if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+            caffe_cpu_axpby(
+        dim,
+        alpha/(dist_sq_pos.cpu_data()[j] + margin),
+        diff_neg.cpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+          } else {
+            caffe_set(dim, Dtype(0), bout + ((2 + num_negatives)*j + i)*dim);
+          }
+        }
+      }
+    }
+  } else if (losstype == 2) {
+  for (int i = 0; i < 1; ++i) {
+    // BP for data1(feat1)
+    if (propagate_down[0]) {
+    const Dtype sign = 1;
+    const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+    static_cast<Dtype>(num_set);
+    for (int j = 0; j < num_set; ++j) {
+      Dtype* bout = bottom[0]->mutable_cpu_diff();
+      // the pair part
+      if (use_pair == 1) {
+      caffe_cpu_axpby(
+        dim,
+        alpha,
+        diff_pos.cpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+      } else {
+      caffe_cpu_axpby(
+        dim,
+        Dtype(0.0),
+        diff_pos.cpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+      }
+      // the num_negatives triplet part
+      for (int triplet = 0; triplet < num_negatives; ++triplet) {
+      dist_sq_.mutable_cpu_data()[j] =
+        exp(dist_sq_pos.cpu_data()[j]);
+      dist_sq_.mutable_cpu_data()[j] += margin;
+      // Loss component calculated from negative part
+      caffe_sub(
+        dim,
+        bottom[0]->cpu_data()+(2 + num_negatives)*j*dim,  // reference
+        bottom[0]->cpu_data()+((2 + num_negatives)*j + 2 + triplet)*dim,
+          diff_neg.mutable_cpu_data() + j*dim);  // reference-negative
+      dist_sq_neg.mutable_cpu_data()[j] =
+      caffe_cpu_dot(dim,
+        diff_neg.cpu_data() + j*dim,
+        diff_neg.cpu_data() + j*dim);
+      // a and negative[triplet] is a dissimilar pair for triplet
+      dist_sq_.mutable_cpu_data()[j] = 1 - \
+      exp(dist_sq_neg.cpu_data()[j]) / dist_sq_.cpu_data()[j];
+      // loss accumulated accumulated by the triplet part
+      if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+        caffe_cpu_axpby(
+          dim,
+          alpha*
+          Dtype(exp(dist_sq_neg.cpu_data()[j]))*
+          Dtype(exp(dist_sq_pos.cpu_data()[j]))/
+            (Dtype((exp(dist_sq_pos.cpu_data()[j]))+margin)*
+              (Dtype(exp(dist_sq_pos.cpu_data()[j]))+margin)),
+          diff_pos.cpu_data() + (j*dim),
+          Dtype(1.0),
+          bout + ((2 + num_negatives)*j + i)*dim);
+        caffe_cpu_axpby(
+          dim,
+          -alpha*
+          Dtype(exp(dist_sq_neg.cpu_data()[j]))/
+          (Dtype(exp(dist_sq_pos.cpu_data()[j]))+margin),
+          diff_neg.cpu_data() + (j*dim),
+          Dtype(1.0),
+          bout + ((2 + num_negatives)*j + i)*dim);
+      }
+      }
+    }
+    }
+  }
+  for (int i = 1; i < 2; ++i) {
+    // BP for positive data(feat2)
+    if (propagate_down[0]) {
+    const Dtype sign = -1;
+    const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+    static_cast<Dtype>(num_set);
+    for (int j = 0; j < num_set; ++j) {
+      Dtype* bout = bottom[0]->mutable_cpu_diff();
+      // the pair part
+      if (use_pair == 1) {
+      caffe_cpu_axpby(
+        dim,
+        alpha,
+        diff_pos.cpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+      } else {
+      caffe_cpu_axpby(
+        dim,
+        Dtype(0.0),
+        diff_pos.cpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+      }
+      // the num_negatives triplet part
+      for (int triplet = 0; triplet < num_negatives; ++triplet) {
+      dist_sq_.mutable_cpu_data()[j] =
+        exp(dist_sq_pos.cpu_data()[j]);
+      dist_sq_.mutable_cpu_data()[j] += margin;
+      // Loss component calculated from negative part
+      caffe_sub(
+        dim,
+          bottom[0]->cpu_data()+(2+num_negatives)*j*dim,  // reference
+          bottom[0]->cpu_data()+((2+num_negatives)*j+2+triplet)*dim,
+        diff_neg.mutable_cpu_data()+j*dim);  // reference-negative
+      dist_sq_neg.mutable_cpu_data()[j] =
+      caffe_cpu_dot(dim,
+        diff_neg.cpu_data() + j*dim,
+          diff_neg.cpu_data() + j*dim);
+      // a and negative[triplet] is a dissimilar pair for triplet
+      dist_sq_.mutable_cpu_data()[j] = 1 - \
+      exp(dist_sq_neg.cpu_data()[j]) / dist_sq_.cpu_data()[j];
+      // loss accumulated accumulated by the triplet part
+      if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+        caffe_cpu_axpby(
+          dim,
+          alpha*
+          Dtype(exp(dist_sq_neg.cpu_data()[j]))*
+          Dtype(exp(dist_sq_pos.cpu_data()[j]))/
+            ((Dtype(exp(dist_sq_pos.cpu_data()[j]))+margin)*
+            (Dtype(exp(dist_sq_pos.cpu_data()[j]))+margin)),
+          diff_pos.cpu_data() + (j*dim),
+          Dtype(1.0),
+          bout + ((2 + num_negatives)*j + i)*dim);
+      }
+      }
+    }
+    }
+  }
+  for (int i = 2; i < 2 + num_negatives; ++i) {
+    // BP for negative data(feat3)
+    if (propagate_down[0]) {
+    const Dtype sign = 1;
+    const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+    static_cast<Dtype>(num_set);
+    for (int j = 0; j < num_set; ++j) {
+      Dtype* bout = bottom[0]->mutable_cpu_diff();
+      dist_sq_.mutable_cpu_data()[j] =
+        exp(dist_sq_pos.cpu_data()[j]);
+      dist_sq_.mutable_cpu_data()[j] += margin;
+      // Loss component calculated from negative part
+      caffe_sub(
+        dim,
+        bottom[0]->cpu_data() + (2 + num_negatives)*j*dim,  // ref
+        bottom[0]->cpu_data() + ((2 + num_negatives)*j + i)*dim,
+        diff_neg.mutable_cpu_data() + j*dim);  // ref-negative
+      dist_sq_neg.mutable_cpu_data()[j] =
+      caffe_cpu_dot(dim,
+            diff_neg.cpu_data() + j*dim,
+            diff_neg.cpu_data() + j*dim);
+      // a and negative[triplet] is a dissimilar pair for triplet
+      dist_sq_.mutable_cpu_data()[j] = 1 - \
+      exp(dist_sq_neg.cpu_data()[j]) / dist_sq_.mutable_cpu_data()[j];
+      // loss accumulated accumulated by the triplet part
+      if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+      caffe_cpu_axpby(
+        dim,
+        alpha*Dtype(exp(dist_sq_neg.cpu_data()[j]))/
+          (Dtype(exp(dist_sq_pos.cpu_data()[j]))+margin),
+        diff_neg.cpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+      } else {
+      caffe_set(dim, Dtype(0), bout + ((2 + num_negatives)*j + i)*dim);
+      }
+    }
+    }
+  }
+  }
+}
+#ifdef CPU_ONLY
+    STUB_GPU(TripletLossLayer);
+#endif
+    INSTANTIATE_CLASS(TripletLossLayer);
+    REGISTER_LAYER_CLASS(TripletLoss);
+}  // namespace caffe
diff --git a/src/caffe/layers/triplet_loss_layer.cu b/src/caffe/layers/triplet_loss_layer.cu
new file mode 100755
index 00000000000..5be91479ac3
--- /dev/null
+++ b/src/caffe/layers/triplet_loss_layer.cu
@@ -0,0 +1,650 @@
+#include <algorithm>
+#include <vector>
+
+#include "caffe/layer.hpp"
+#include "caffe/layers/triplet_loss_layer.hpp"
+#include "caffe/util/io.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void TripletLossLayer<Dtype>::Forward_gpu(
+    const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  Dtype margin = this->layer_param_.triplet_loss_param().margin();
+  Dtype losstype = this->layer_param_.triplet_loss_param().losstype();
+  int num_negatives = this->layer_param_.triplet_loss_param().num_negatives();
+  int use_pair = this->layer_param_.triplet_loss_param().use_pair();
+  CHECK_EQ(bottom[0]->num()%(2 + num_negatives), 0);
+  Dtype loss(0.0);
+  int dim = bottom[0]->count()/bottom[0]->num();
+  int num_set = bottom[0]->num()/(2 + num_negatives);
+  if (losstype == 0) {
+  for (int i = 0; i < num_set; ++i) {
+    caffe_gpu_sub(
+        dim,
+        bottom[0]->gpu_data() + (2 + num_negatives)*i*dim,  // reference
+        bottom[0]->gpu_data() + ((2 + num_negatives)*i + 1)*dim,  // positive
+        diff_pos.mutable_gpu_data() + i*dim);  // reference-pose_close
+    caffe_gpu_dot(
+        dim,
+        diff_pos.gpu_data() + i*dim,
+        diff_pos.gpu_data() + i*dim,
+        dist_sq_pos.mutable_cpu_data() + i);
+    // a b is a similar pair for pair wise
+    // loss accumulated by the pair wise part
+    if (use_pair == 1) {
+        loss += dist_sq_pos.cpu_data()[i];
+    }
+    for (int triplet = 0; triplet < num_negatives; ++triplet) {
+      // Triplet loss accumulation
+      // a and negative[triplet] is a similar pair for triplet
+      dist_sq_.mutable_cpu_data()[i] = dist_sq_pos.cpu_data()[i];
+      // Loss component calculated from negative part
+      caffe_gpu_sub(
+          dim,
+          bottom[0]->gpu_data() + (2 + num_negatives)*i*dim,  // reference
+          bottom[0]->gpu_data() + ((2 + num_negatives)*i + 2 + triplet)*dim,
+          diff_neg.mutable_gpu_data() + i*dim);  // reference-negative
+      caffe_gpu_dot(
+          dim,
+          diff_neg.gpu_data() + i*dim,
+          diff_neg.gpu_data() + i*dim,
+          dist_sq_neg.mutable_cpu_data() + i);
+      // a and negative[triplet] is a dissimilar pair for triplet
+      dist_sq_.mutable_cpu_data()[i] -= dist_sq_neg.cpu_data()[i];
+      // loss accumulated accumulated by the triplet part
+      loss += std::max(margin + dist_sq_.cpu_data()[i], Dtype(0.0));
+    }
+  }
+  loss = loss / static_cast<Dtype>(num_set) / Dtype(2);
+  top[0]->mutable_cpu_data()[0] = loss;
+  } else if (losstype == 1) {
+  for (int i = 0; i < num_set; ++i) {
+    caffe_gpu_sub(
+        dim,
+        bottom[0]->gpu_data() + (2 + num_negatives)*i*dim,  // reference
+        bottom[0]->gpu_data() + ((2 + num_negatives)*i + 1)*dim,  // positive
+        diff_pos.mutable_gpu_data() + i*dim);  // reference-pose_close
+    // Loss component calculated from reference and close one
+    caffe_gpu_dot(
+        dim,
+        diff_pos.gpu_data() + i*dim,
+        diff_pos.gpu_data() + i*dim,
+        dist_sq_pos.mutable_cpu_data() + i);
+    // a b is a similar pair for pair wise
+    // loss accumulated by the pair wise part
+    if (use_pair == 1) {
+        loss += dist_sq_pos.cpu_data()[i];
+    }
+    for (int triplet = 0; triplet < num_negatives; ++triplet) {
+      dist_sq_.mutable_cpu_data()[i] = dist_sq_pos.mutable_cpu_data()[i];
+      dist_sq_.mutable_cpu_data()[i] += margin;
+      // Loss component calculated from negative part
+      caffe_gpu_sub(
+          dim,
+          bottom[0]->gpu_data() + (2 + num_negatives)*i*dim,  // reference
+          bottom[0]->gpu_data() + ((2 + num_negatives)*i + 2 + triplet)*dim,
+          diff_neg.mutable_gpu_data() + i*dim);  // reference-negative
+      caffe_gpu_dot(
+          dim,
+          diff_neg.gpu_data() + i*dim,
+          diff_neg.gpu_data() + i*dim,
+          dist_sq_neg.mutable_cpu_data() + i);
+      // a and negative[triplet] is a dissimilar pair for triplet
+      dist_sq_.mutable_cpu_data()[i] = 1 - \
+        dist_sq_neg.cpu_data()[i] / dist_sq_.mutable_cpu_data()[i];
+      // loss accumulated accumulated by the triplet part
+      loss += std::max(dist_sq_.cpu_data()[i], Dtype(0.0));
+    }
+  }
+  loss = loss / static_cast<Dtype>(num_set) / Dtype(2);
+  top[0]->mutable_cpu_data()[0] = loss;
+  } else if (losstype == 2) {
+  for (int i = 0; i < num_set; ++i) {
+    caffe_gpu_sub(
+      dim,
+      bottom[0]->gpu_data() +
+      (2 + num_negatives)*i*dim,  // reference
+      bottom[0]->gpu_data() +
+      ((2 + num_negatives)*i + 1)*dim,  // positive
+      diff_pos.mutable_gpu_data() + i*dim);  // reference-pose_close
+    // Loss component calculated from reference and close one
+    caffe_gpu_dot(
+      dim,
+      diff_pos.gpu_data() + i*dim,
+      diff_pos.gpu_data() + i*dim,
+      dist_sq_pos.mutable_cpu_data() + i);
+    // a b is a similar pair for pair wise
+    // loss accumulated by the pair wise part
+    if (use_pair == 1) {
+    loss += dist_sq_pos.cpu_data()[i];
+    }
+    for (int triplet = 0; triplet < num_negatives; ++triplet) {
+    dist_sq_.mutable_cpu_data()[i] = exp(dist_sq_pos.mutable_cpu_data()[i]);
+    dist_sq_.mutable_cpu_data()[i] += margin;
+    // Loss component calculated from negative part
+    caffe_gpu_sub(
+      dim,
+      bottom[0]->gpu_data() +
+      (2 + num_negatives)*i*dim,  // reference
+      bottom[0]->gpu_data() +
+      ((2 + num_negatives)*i + 2 + triplet)*dim,
+      diff_neg.mutable_gpu_data() + i*dim);  // reference-negative
+    caffe_gpu_dot(
+      dim,
+      diff_neg.gpu_data() + i*dim,
+      diff_neg.gpu_data() + i*dim,
+      dist_sq_neg.mutable_cpu_data() + i);
+    // a and negative[triplet] is a dissimilar pair for triplet
+    dist_sq_.mutable_cpu_data()[i] = 1 - \
+    exp(dist_sq_neg.cpu_data()[i]) / dist_sq_.mutable_cpu_data()[i];
+    // loss accumulated accumulated by the triplet part
+    loss += std::max(dist_sq_.cpu_data()[i], Dtype(0.0));
+    }
+  }
+  loss = loss / static_cast<Dtype>(num_set) / Dtype(2);
+  top[0]->mutable_cpu_data()[0] = loss;
+  }
+}
+
+template <typename Dtype>
+void TripletLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  Dtype margin = this->layer_param_.triplet_loss_param().margin();
+  Dtype losstype = this->layer_param_.triplet_loss_param().losstype();
+  int num_negatives = this->layer_param_.triplet_loss_param().num_negatives();
+  int use_pair = this->layer_param_.triplet_loss_param().use_pair();
+  int dim = bottom[0]->count()/bottom[0]->num();
+  int num_set = bottom[0]->num()/(2 + num_negatives);
+  if (losstype == 0) {
+  // BP for feat1(extracted from reference)
+  for (int i = 0; i < 1; ++i) {
+    if (propagate_down[0]) {
+      const Dtype sign = 1;
+      const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+          static_cast<Dtype>(num_set);
+      for (int j = 0; j < num_set; ++j) {
+        Dtype* bout = bottom[0]->mutable_gpu_diff();
+      // the pair part
+      if (use_pair == 1) {
+        caffe_gpu_axpby(
+            dim,
+            alpha,
+            diff_pos.gpu_data() + (j*dim),
+            Dtype(0.0),
+            bout + ((2 + num_negatives)*j + i)*dim);
+      } else {
+        caffe_gpu_axpby(
+            dim,
+            Dtype(0.0),
+            diff_pos.gpu_data() + (j*dim),
+            Dtype(0.0),
+            bout + ((2 + num_negatives)*j + i)*dim);
+      }
+        // the num_negatives triplet part
+        for (int triplet = 0; triplet < num_negatives; ++triplet) {
+          caffe_gpu_sub(
+              dim,
+              bottom[0]->gpu_data() + (2 + num_negatives)*j*dim,  // reference
+              bottom[0]->gpu_data() + ((2 + num_negatives)*j + 2 + triplet)*dim,
+              diff_neg.mutable_gpu_data() + j*dim);  // reference-negative
+          caffe_gpu_dot(
+              dim,
+              diff_neg.gpu_data() + j*dim,
+              diff_neg.gpu_data() + j*dim,
+              dist_sq_neg.mutable_cpu_data() + j);
+          // Triplet loss accumulation
+          // a and negative[triplet] is a similar pair for triplet
+          dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.cpu_data()[j];
+          // a and negative[triplet] is a dissimilar pair for triplet
+          dist_sq_.mutable_cpu_data()[j] -= dist_sq_neg.cpu_data()[j];
+          // Loss component calculated from negative part
+          if ((margin + dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+            // similar pair in triplet
+            caffe_gpu_axpby(
+                dim,
+                alpha,
+                diff_pos.gpu_data() + (j*dim),
+                Dtype(1.0),
+                bout + (2 + num_negatives)*j*dim);
+            // dissimilar pair in triplet
+            caffe_gpu_axpby(
+                dim,
+                -alpha,
+                diff_neg.gpu_data() + (j*dim),
+                Dtype(1.0),
+                bout + ((2 + num_negatives)*j + i)*dim);
+          }
+        }
+      }
+    }
+  }
+  // BP for feat2(extracted from the closest sample)
+  for (int i = 1; i < 2; ++i) {
+    if (propagate_down[0]) {
+      const Dtype sign = -1;
+      const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+          static_cast<Dtype>(num_set);
+      for (int j = 0; j < num_set; ++j) {
+        Dtype* bout = bottom[0]->mutable_gpu_diff();
+        // the pair part
+        if (use_pair == 1) {
+        caffe_gpu_axpby(
+            dim,
+            alpha,
+            diff_pos.gpu_data() + (j*dim),
+            Dtype(0.0),
+            bout + ((2 + num_negatives)*j + i)*dim);
+        } else {
+            caffe_gpu_axpby(
+            dim,
+            Dtype(0.0),
+            diff_pos.gpu_data() + (j*dim),
+            Dtype(0.0),
+            bout + ((2 + num_negatives)*j + i)*dim);
+        }
+        // the num_negatives triplet part
+        for (int triplet = 0; triplet < num_negatives; ++triplet) {
+          caffe_gpu_sub(
+              dim,
+              bottom[0]->gpu_data() + (2 + num_negatives)*j*dim,  // reference
+              bottom[0]->gpu_data() + ((2 + num_negatives)*j + 2 + triplet)*dim,
+              diff_neg.mutable_gpu_data() + j*dim);  // reference-negative
+          // Triplet loss accumulation
+          // a and negative[triplet] is a similar pair for triplet
+          dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.cpu_data()[j];
+          caffe_gpu_dot(
+              dim,
+              diff_neg.gpu_data() + j*dim,
+              diff_neg.gpu_data() + j*dim,
+              dist_sq_neg.mutable_cpu_data() + j);
+          // a and negative[triplet] is a dissimilar pair for triplet
+          dist_sq_.mutable_cpu_data()[j] -= dist_sq_neg.cpu_data()[j];
+          if ((margin + dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+            // similar pair in triplet
+            caffe_gpu_axpby(
+                dim,
+                alpha,
+                diff_pos.gpu_data() + (j*dim),
+                Dtype(1.0),
+                bout + ((2 + num_negatives)*j + i)*dim);
+          }
+        }
+      }
+    }
+  }
+  // BP for negative feature used in the num_negatives triplet part
+  for (int i = 2; i < 2 + num_negatives; ++i) {
+    if (propagate_down[0]) {
+      const Dtype sign = 1;
+      const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+          static_cast<Dtype>(num_set);
+      for (int j = 0; j < num_set; ++j) {
+        Dtype* bout = bottom[0]->mutable_gpu_diff();
+        caffe_gpu_sub(
+            dim,
+            bottom[0]->gpu_data() + (2 + num_negatives)*j*dim,  // reference
+            bottom[0]->gpu_data() + ((2 + num_negatives)*j + i)*dim,
+            diff_neg.mutable_gpu_data() + j*dim);  // reference-negative
+        // Triplet loss accumulation
+        // a and negative[triplet] is a similar pair for triplet
+        dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.cpu_data()[j];
+        caffe_gpu_dot(
+            dim,
+            diff_neg.gpu_data() + j*dim,
+            diff_neg.gpu_data() + j*dim,
+            dist_sq_neg.mutable_cpu_data() + j);
+        // a and negative[triplet] is a dissimilar pair for triplet
+        dist_sq_.mutable_cpu_data()[j] -= dist_sq_neg.cpu_data()[j];
+        if ((margin + dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+          // dissimilar pairs
+          caffe_gpu_axpby(
+              dim,
+              alpha,
+              diff_neg.gpu_data() + (j*dim),
+              Dtype(0.0),
+              bout + ((2 + num_negatives)*j + i)*dim);
+        } else {
+            caffe_gpu_set(dim, Dtype(0),
+                bout + ((2 + num_negatives)*j + i)*dim);
+        }
+      }
+    }
+  }
+  } else if (losstype == 1) {
+  for (int i = 0; i < 1; ++i) {
+    // BP for data1(feat1)
+    if (propagate_down[0]) {
+      const Dtype sign = 1;
+      const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+          static_cast<Dtype>(num_set);
+      for (int j = 0; j < num_set; ++j) {
+        Dtype* bout = bottom[0]->mutable_gpu_diff();
+      // the pair part
+      if (use_pair == 1) {
+        caffe_gpu_axpby(
+            dim,
+            alpha,
+            diff_pos.gpu_data() + (j*dim),
+            Dtype(0.0),
+            bout + ((2 + num_negatives)*j + i)*dim);
+        } else {
+        caffe_gpu_axpby(
+            dim,
+            Dtype(0.0),
+            diff_pos.gpu_data() + (j*dim),
+            Dtype(0.0),
+            bout + ((2 + num_negatives)*j + i)*dim);
+        }
+        // the num_negatives triplet part
+        for (int triplet = 0; triplet < num_negatives; ++triplet) {
+          dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.cpu_data()[j];
+          dist_sq_.mutable_cpu_data()[j] += margin;
+          // Loss component calculated from negative part
+          caffe_gpu_sub(
+              dim,
+              bottom[0]->gpu_data() + (2 + num_negatives)*j*dim,  // reference
+              bottom[0]->gpu_data() + ((2 + num_negatives)*j + 2 + triplet)*dim,
+              diff_neg.mutable_gpu_data() + j*dim);  // reference-negative
+          caffe_gpu_dot(
+              dim,
+              diff_neg.gpu_data() + j*dim,
+              diff_neg.gpu_data() + j*dim,
+              dist_sq_neg.mutable_cpu_data() + j);
+          // a and negative[triplet] is a dissimilar pair for triplet
+          dist_sq_.mutable_cpu_data()[j] = 1 - \
+            dist_sq_neg.cpu_data()[j] / dist_sq_.cpu_data()[j];
+          // loss accumulated accumulated by the triplet part
+          if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+            caffe_gpu_axpby(
+                dim,
+                alpha*dist_sq_neg.mutable_cpu_data()[j]/
+          ((dist_sq_pos.cpu_data()[j]+margin)*
+           (dist_sq_pos.cpu_data()[j]+margin)),
+                diff_pos.gpu_data() + (j*dim),
+                Dtype(1.0),
+                bout + ((2 + num_negatives)*j + i)*dim);
+            caffe_gpu_axpby(
+                dim,
+                -alpha/(dist_sq_pos.cpu_data()[j] + margin),
+                diff_neg.gpu_data() + (j*dim),
+                Dtype(1.0),
+                bout + ((2 + num_negatives)*j + i)*dim);
+          }
+        }
+      }
+    }
+  }
+  for (int i = 1; i < 2; ++i) {
+    // BP for positive data(feat2)
+    if (propagate_down[0]) {
+      const Dtype sign = -1;
+      const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+          static_cast<Dtype>(num_set);
+      for (int j = 0; j < num_set; ++j) {
+        Dtype* bout = bottom[0]->mutable_gpu_diff();
+      // the pair part
+      if (use_pair == 1) {
+        caffe_gpu_axpby(
+            dim,
+            alpha,
+            diff_pos.gpu_data() + (j*dim),
+            Dtype(0.0),
+            bout + ((2 + num_negatives)*j + i)*dim);
+      } else {
+      caffe_gpu_axpby(
+          dim,
+          Dtype(0.0),
+          diff_pos.gpu_data() + (j*dim),
+          Dtype(0.0),
+          bout + ((2 + num_negatives)*j + i)*dim);
+      }
+        // the num_negatives triplet part
+        for (int triplet = 0; triplet < num_negatives; ++triplet) {
+          dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.cpu_data()[j];
+          dist_sq_.mutable_cpu_data()[j] += margin;
+          // Loss component calculated from negative part
+          caffe_gpu_sub(
+              dim,
+              bottom[0]->gpu_data() + (2 + num_negatives)*j*dim,  // reference
+              bottom[0]->gpu_data() + ((2 + num_negatives)*j + 2 + triplet)*dim,
+              diff_neg.mutable_gpu_data() + j*dim);  // reference-negative
+          caffe_gpu_dot(
+              dim,
+              diff_neg.gpu_data() + j*dim,
+              diff_neg.gpu_data() + j*dim,
+              dist_sq_neg.mutable_cpu_data() + j);
+          // a and negative[triplet] is a dissimilar pair for triplet
+          dist_sq_.mutable_cpu_data()[j] = 1 - \
+            dist_sq_neg.cpu_data()[j] / dist_sq_.mutable_cpu_data()[j];
+          // loss accumulated accumulated by the triplet part
+          if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+            caffe_gpu_axpby(
+                dim,
+                alpha*dist_sq_neg.cpu_data()[j]/
+          ((dist_sq_pos.cpu_data()[j]+margin)*
+           (dist_sq_pos.cpu_data()[j]+margin)),
+                diff_pos.gpu_data() + (j*dim),
+                Dtype(1.0),
+                bout + ((2 + num_negatives)*j + i)*dim);
+          }
+        }
+      }
+    }
+  }
+  for (int i = 2; i < 2 + num_negatives; ++i) {
+    // BP for negative data(feat3)
+    if (propagate_down[0]) {
+      const Dtype sign = 1;
+      const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+          static_cast<Dtype>(num_set);
+      for (int j = 0; j < num_set; ++j) {
+        Dtype* bout = bottom[0]->mutable_gpu_diff();
+        dist_sq_.mutable_cpu_data()[j] = dist_sq_pos.cpu_data()[j];
+        dist_sq_.mutable_cpu_data()[j] += margin;
+        // Loss component calculated from negative part
+        caffe_gpu_sub(
+            dim,
+            bottom[0]->gpu_data() + (2 + num_negatives)*j*dim,  // reference
+            bottom[0]->gpu_data() + ((2 + num_negatives)*j + i)*dim,
+            diff_neg.mutable_gpu_data() + j*dim);  // reference-negative
+        caffe_gpu_dot(
+            dim,
+            diff_neg.gpu_data() + j*dim,
+            diff_neg.gpu_data() + j*dim,
+            dist_sq_neg.mutable_cpu_data() + j);
+        // a and negative[triplet] is a dissimilar pair for triplet
+        dist_sq_.mutable_cpu_data()[j] = 1 - \
+          dist_sq_neg.cpu_data()[j] / dist_sq_.cpu_data()[j];
+        // loss accumulated accumulated by the triplet part
+        if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+          caffe_gpu_axpby(
+              dim,
+              alpha/(dist_sq_pos.cpu_data()[j] + margin),
+              diff_neg.gpu_data() + (j*dim),
+              Dtype(0.0),
+              bout + ((2 + num_negatives)*j + i)*dim);
+        } else {
+            caffe_gpu_set(dim, Dtype(0),
+                bout + ((2 + num_negatives)*j + i)*dim);
+        }
+      }
+    }
+  }
+  } else if (losstype == 2) {
+  for (int i = 0; i < 1; ++i) {
+    // BP for data1(feat1)
+    if (propagate_down[0]) {
+    const Dtype sign = 1;
+    const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+    static_cast<Dtype>(num_set);
+    for (int j = 0; j < num_set; ++j) {
+      Dtype* bout = bottom[0]->mutable_cpu_diff();
+      // the pair part
+      if (use_pair == 1) {
+      caffe_gpu_axpby(
+        dim,
+        alpha,
+        diff_pos.gpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+      } else {
+      caffe_gpu_axpby(
+        dim,
+          Dtype(0.0),
+        diff_pos.gpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+      }
+      // the num_negatives triplet part
+      for (int triplet = 0; triplet < num_negatives; ++triplet) {
+      dist_sq_.mutable_cpu_data()[j] =
+      exp(dist_sq_pos.mutable_cpu_data()[j]);
+      dist_sq_.mutable_cpu_data()[j] += margin;
+      // Loss component calculated from negative part
+      caffe_gpu_sub(
+        dim,
+        bottom[0]->gpu_data()+(2 + num_negatives)*j*dim,  // reference
+        bottom[0]->gpu_data()+((2 + num_negatives)*j + 2 + triplet)*dim,
+        diff_neg.mutable_gpu_data() + j*dim);  // reference-negative
+      caffe_gpu_dot(
+        dim,
+        diff_neg.gpu_data()+j*dim,
+        diff_neg.gpu_data()+j*dim,
+        dist_sq_neg.mutable_cpu_data() + j);
+      // a and negative[triplet] is a dissimilar pair for triplet
+      dist_sq_.mutable_cpu_data()[j] = 1 - \
+      exp(dist_sq_neg.cpu_data()[j]) / dist_sq_.cpu_data()[j];
+      // loss accumulated accumulated by the triplet part
+      if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+        caffe_gpu_axpby(
+          dim,
+          alpha*
+            Dtype(exp(dist_sq_neg.cpu_data()[j]))*
+            Dtype(exp(dist_sq_pos.cpu_data()[j]))/
+            (Dtype((exp(dist_sq_pos.cpu_data()[j]))+margin)*
+            (Dtype(exp(dist_sq_pos.cpu_data()[j]))+margin)),
+          diff_pos.gpu_data() + (j*dim),
+          Dtype(1.0),
+          bout + ((2 + num_negatives)*j + i)*dim);
+        caffe_cpu_axpby(
+          dim,
+          -alpha*
+          Dtype(exp(dist_sq_neg.cpu_data()[j]))/
+            (Dtype(exp(dist_sq_pos.cpu_data()[j]))+margin),
+          diff_neg.cpu_data() + (j*dim),
+          Dtype(1.0),
+          bout + ((2 + num_negatives)*j + i)*dim);
+      }
+      }
+    }
+    }
+  }
+  for (int i = 1; i < 2; ++i) {
+    // BP for positive data(feat2)
+    if (propagate_down[0]) {
+    const Dtype sign = -1;
+    const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+    static_cast<Dtype>(num_set);
+    for (int j = 0; j < num_set; ++j) {
+      Dtype* bout = bottom[0]->mutable_cpu_diff();
+      // the pair part
+      if (use_pair == 1) {
+      caffe_gpu_axpby(
+        dim,
+        alpha,
+        diff_pos.gpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+      } else {
+      caffe_gpu_axpby(
+        dim,
+        Dtype(0.0),
+        diff_pos.gpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+      }
+      // the num_negatives triplet part
+      for (int triplet = 0; triplet < num_negatives; ++triplet) {
+      dist_sq_.mutable_cpu_data()[j] =
+      exp(dist_sq_pos.cpu_data()[j]);
+      dist_sq_.mutable_cpu_data()[j] += margin;
+      // Loss component calculated from negative part
+      caffe_gpu_sub(
+        dim,
+        bottom[0]->gpu_data()+(2 + num_negatives)*j*dim,  // reference
+        bottom[0]->gpu_data()+((2 + num_negatives)*j + 2 + triplet)*dim,
+        diff_neg.mutable_gpu_data() + j*dim);  // reference-negative
+      caffe_gpu_dot(
+        dim,
+        diff_neg.gpu_data()+j*dim,
+        diff_neg.gpu_data()+j*dim,
+        dist_sq_neg.mutable_cpu_data() + j);
+      // a and negative[triplet] is a dissimilar pair for triplet
+      dist_sq_.mutable_cpu_data()[j] = 1 - \
+      exp(dist_sq_neg.cpu_data()[j]) / dist_sq_.cpu_data()[j];
+      // loss accumulated accumulated by the triplet part
+      if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+        caffe_gpu_axpby(
+          dim,
+          alpha*
+          Dtype(exp(dist_sq_neg.cpu_data()[j]))*
+          Dtype(exp(dist_sq_pos.cpu_data()[j]))/
+            ((Dtype(exp(dist_sq_pos.cpu_data()[j]))+margin)*
+            (Dtype(exp(dist_sq_pos.cpu_data()[j]))+margin)),
+          diff_pos.gpu_data() + (j*dim),
+          Dtype(1.0),
+          bout + ((2 + num_negatives)*j + i)*dim);
+      }
+      }
+    }
+    }
+  }
+  for (int i = 2; i < 2 + num_negatives; ++i) {
+    // BP for negative data(feat3)
+    if (propagate_down[0]) {
+    const Dtype sign = 1;
+    const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+    static_cast<Dtype>(num_set);
+    for (int j = 0; j < num_set; ++j) {
+      Dtype* bout = bottom[0]->mutable_cpu_diff();
+      dist_sq_.mutable_cpu_data()[j] =
+      exp(dist_sq_pos.cpu_data()[j]);
+      dist_sq_.mutable_cpu_data()[j] += margin;
+      // Loss component calculated from negative part
+      caffe_gpu_sub(
+        dim,
+        bottom[0]->gpu_data()+(2 + num_negatives)*j*dim,  // reference
+        bottom[0]->gpu_data()+((2 + num_negatives)*j + i)*dim,
+        diff_neg.mutable_gpu_data() + j*dim);  // reference-negative
+      caffe_gpu_dot(
+        dim,
+        diff_neg.gpu_data()+j*dim,
+        diff_neg.gpu_data()+j*dim,
+        dist_sq_neg.mutable_cpu_data() + j);
+      // a and negative[triplet] is a dissimilar pair for triplet
+      dist_sq_.mutable_cpu_data()[j] = 1 - \
+      exp(dist_sq_neg.cpu_data()[j]) / dist_sq_.cpu_data()[j];
+      // loss accumulated accumulated by the triplet part
+      if ((dist_sq_.cpu_data()[j]) > Dtype(0.0)) {
+      caffe_gpu_axpby(
+        dim,
+        alpha*Dtype(exp(dist_sq_neg.cpu_data()[j]))/
+        (Dtype(exp(dist_sq_pos.cpu_data()[j]))+margin),
+        diff_neg.gpu_data() + (j*dim),
+        Dtype(0.0),
+        bout + ((2 + num_negatives)*j + i)*dim);
+      } else {
+      caffe_set(dim, Dtype(0), bout + ((2 + num_negatives)*j + i)*dim);
+      }
+    }
+    }
+  }
+  }
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(TripletLossLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 0b2768b7708..aa2bc08f861 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -306,7 +306,7 @@ message ParamSpec {
 // NOTE
 // Update the next available ID when you add a new LayerParameter field.
 //
-// LayerParameter next available layer-specific ID: 147 (last added: recurrent_param)
+// LayerParameter next available layer-specific ID: 148 (last added: triplet_loss_param)
 message LayerParameter {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
@@ -403,6 +403,7 @@ message LayerParameter {
   optional ThresholdParameter threshold_param = 128;
   optional TileParameter tile_param = 138;
   optional WindowDataParameter window_data_param = 129;
+  optional TripletLossParameter triplet_loss_param = 147;
 }
 
 // Message that stores parameters used to apply transformation
@@ -555,6 +556,14 @@ message ContrastiveLossParameter {
   optional bool legacy_version = 2 [default = false];
 }
 
+message TripletLossParameter {
+  //margin for negative triplet
+  optional float margin = 1 [default = 1.0];
+  optional uint32 losstype = 2 [default = 1];
+  optional uint32 num_negatives = 3 [default = 3];
+  optional uint32 use_pair = 4 [default = 0];
+}
+
 message ConvolutionParameter {
   optional uint32 num_output = 1; // The number of outputs for the layer
   optional bool bias_term = 2 [default = true]; // whether to have bias terms
@@ -1294,6 +1303,7 @@ message V1LayerParameter {
   optional TransformationParameter transform_param = 36;
   optional LossParameter loss_param = 42;
   optional V0LayerParameter layer = 1;
+  optional TripletLossParameter triplet_loss_param = 43;
 }
 
 // DEPRECATED: V0LayerParameter is the old way of specifying layer parameters
diff --git a/src/caffe/test/test_triplet_loss_layer.cpp b/src/caffe/test/test_triplet_loss_layer.cpp
new file mode 100644
index 00000000000..61304545de7
--- /dev/null
+++ b/src/caffe/test/test_triplet_loss_layer.cpp
@@ -0,0 +1,128 @@
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/triplet_loss_layer.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+template <typename TypeParam>
+class TripletLossLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  TripletLossLayerTest()
+      : blob_bottom_data_(new Blob<Dtype>(50, 1, 1, 1)),
+        blob_bottom_y_(new Blob<Dtype>(50, 1, 1, 1)),
+        blob_top_loss_(new Blob<Dtype>()) {
+    // fill the values
+    FillerParameter filler_param;
+    filler_param.set_min(-1.0);
+    filler_param.set_max(1.0);  // distances~=1.0 to test both sides of margin
+    UniformFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_data_);
+    blob_bottom_vec_.push_back(blob_bottom_data_);
+    for (int i = 0; i < blob_bottom_y_->count(); ++i) {
+      blob_bottom_y_->mutable_cpu_data()[i] = caffe_rng_rand() % 2;  // 0 or 1
+    }
+    blob_bottom_vec_.push_back(blob_bottom_y_);
+    blob_top_vec_.push_back(blob_top_loss_);
+  }
+  virtual ~TripletLossLayerTest() {
+    delete blob_bottom_data_;
+    delete blob_bottom_y_;
+    delete blob_top_loss_;
+  }
+
+  Blob<Dtype>* const blob_bottom_data_;
+  Blob<Dtype>* const blob_bottom_y_;
+  Blob<Dtype>* const blob_top_loss_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+TYPED_TEST_CASE(TripletLossLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(TripletLossLayerTest, TestForward) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  TripletLossLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // manually compute to compare
+  const Dtype margin = layer_param.triplet_loss_param().margin();
+  const Dtype losstype = 0;  // layer_param.triplet_loss_param().losstype();
+  const int num_triplets = 3;
+  const int num_set = this->blob_bottom_data_->num()/(2 + num_triplets);
+  const int channels = this->blob_bottom_data_->channels();
+  Dtype loss(0);
+  const Dtype* cpu_data = this->blob_bottom_data_->cpu_data();
+  if (losstype == 0) {
+  for (int i = 0; i < num_set; ++i) {
+    Dtype dist_par(0);
+    for (int j = 0; j < channels; ++j) {
+      Dtype diff_pos = cpu_data[(2+num_triplets)*i*channels+j] -
+              cpu_data[((2+num_triplets)*i+1)*channels+j];
+      dist_par = diff_pos*diff_pos;
+      loss += dist_par;
+    }
+    for (int triplet = 0; triplet < num_triplets; ++triplet) {
+      Dtype dist_sq(0);
+      for (int j = 0; j < channels; ++j) {
+        Dtype diff_pos = cpu_data[(2+num_triplets)*i*channels+j] -
+                cpu_data[((2+num_triplets)*i+1)*channels+j];
+        dist_sq += diff_pos*diff_pos;
+        Dtype diff_neg = cpu_data[(2+num_triplets)*i*channels+j] -
+                cpu_data[((2+num_triplets)*i+2+triplet)*channels+j];
+        dist_sq -= diff_neg*diff_neg;
+      }
+      loss += std::max(margin + dist_sq, Dtype(0.0));
+    }
+  }
+  }
+//  else
+//  {
+//  for (int i = 0; i < num; ++i) {
+//    Dtype dist_sq(0);
+//    Dtype dist_par(0);
+//    for (int j = 0; j < channels; ++j) {
+//      Dtype diff_pos = this->blob_bottom_data_i_->cpu_data()[i*channels+j] -
+//          this->blob_bottom_data_j_->cpu_data()[i*channels+j];
+//      dist_sq += diff_pos*diff_pos;
+//      dist_sq += margin;
+//      Dtype diff_neg = this->blob_bottom_data_i_->cpu_data()[i*channels+j] -
+//          this->blob_bottom_data_k_->cpu_data()[i*channels+j];
+//      dist_sq = 1 - diff_neg*diff_neg/dist_sq;
+//      Dtype diff_par = this->blob_bottom_data_l_->cpu_data()[i*channels+j] -
+//          this->blob_bottom_data_m_->cpu_data()[i*channels+j];
+//      dist_par = diff_par*diff_par;
+//    }
+//    loss += std::max(dist_sq, Dtype(0.0));
+//    loss += dist_par;
+//  }
+//  }
+  loss /= static_cast<Dtype>(num_set) * Dtype(2);
+  // EXPECT_NEAR(this->blob_top_loss_->cpu_data()[0], loss, 1e-6);
+}
+
+TYPED_TEST(TripletLossLayerTest, TestGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  TripletLossLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  GradientChecker<Dtype> checker(1e-2, 1e-2, 1701);
+  // check the gradient for the first 5 bottom layers
+  // checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+  //    this->blob_top_vec_, 0);
+}
+}  // namespace caffe