|
| 1 | +# Copyright 2017 Databricks, Inc. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | +# |
| 15 | +from __future__ import absolute_import, division, print_function |
| 16 | + |
| 17 | +import logging |
| 18 | +import tensorflow as tf |
| 19 | +from tensorflow.python.tools import optimize_for_inference_lib as infr_opt |
| 20 | +import tensorframes as tfs |
| 21 | + |
| 22 | +from pyspark.ml import Transformer |
| 23 | + |
| 24 | +import sparkdl.graph.utils as tfx |
| 25 | +from sparkdl.param import (keyword_only, HasInputMapping, HasOutputMapping, |
| 26 | + HasTFInputGraph, HasTFHParams) |
| 27 | + |
| 28 | +__all__ = ['TFTransformer'] |
| 29 | + |
| 30 | +logger = logging.getLogger('sparkdl') |
| 31 | + |
class TFTransformer(Transformer, HasTFInputGraph, HasTFHParams, HasInputMapping, HasOutputMapping):
    """
    Applies the TensorFlow graph to the array column in DataFrame.

    Restrictions of the current API:

    We assume that
    - All the inputs of the graphs have a "minibatch" dimension (i.e. an unknown leading
      dimension) in the tensor shapes.
    - Input DataFrame has an array column where all elements have the same length
    - The transformer is expected to work on blocks of data at the same time.
    """

    @keyword_only
    def __init__(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tfHParms=None):
        """
        __init__(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tfHParms=None)
        """
        super(TFTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tfHParms=None):
        """
        setParams(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tfHParms=None)
        """
        # NOTE: intentionally no super().__init__() here. Re-running Params.__init__
        # would reset the param map, silently discarding any params set before a
        # later setParams call; base-class initialization belongs in __init__ only.
        kwargs = self._input_kwargs
        # Further canonicalization, e.g. converting dict to sorted str pairs, happens here
        return self._set(**kwargs)

    def _optimize_for_inference(self):
        """
        Optimize the stored input graph for inference.

        Derives the input/output node names from the input/output mappings and
        runs TensorFlow's ``optimize_for_inference`` pass over the GraphDef.

        :return: the optimized ``tf.GraphDef``
        """
        gin = self.getTFInputGraph()
        input_mapping = self.getInputMapping()
        output_mapping = self.getOutputMapping()
        # input_mapping pairs are (column_name, tensor_name);
        # output_mapping pairs are (tensor_name, column_name).
        input_node_names = [tfx.op_name(tnsr_name) for _, tnsr_name in input_mapping]
        output_node_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]

        # NOTE(phi-dbq): Spark DataFrame assumes float64 as default floating point type
        opt_gdef = infr_opt.optimize_for_inference(gin.graph_def,
                                                   input_node_names,
                                                   output_node_names,
                                                   # TODO: below is the place to change for
                                                   # the `float64` data type issue.
                                                   tf.float64.as_datatype_enum)
        return opt_gdef

    def _transform(self, dataset):
        """
        Apply the (inference-optimized) TensorFlow graph to ``dataset``.

        :param dataset: input Spark DataFrame; must contain the columns named
                        on the left side of the input mapping.
        :return: a DataFrame with one additional column per entry of the
                 output mapping, renamed according to that mapping.
        """
        graph_def = self._optimize_for_inference()
        input_mapping = self.getInputMapping()
        output_mapping = self.getOutputMapping()

        graph = tf.Graph()
        with tf.Session(graph=graph):
            # tensorframes needs the DataFrame's block structure analyzed first
            analyzed_df = tfs.analyze(dataset)

            out_tnsr_op_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]
            # Importing with name='' keeps the original node names so the
            # mapping-derived op names resolve inside `graph`.
            tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names)

            # Feed each input tensor (by op name) from its DataFrame column.
            feed_dict = dict((tfx.op_name(tnsr_name, graph), col_name)
                             for col_name, tnsr_name in input_mapping)
            fetches = [tfx.get_tensor(tnsr_op_name, graph) for tnsr_op_name in out_tnsr_op_names]

            out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)

            # tensorframes names output columns after the op; rename them to the
            # user-requested column names.
            for tnsr_name, new_colname in output_mapping:
                old_colname = tfx.op_name(tnsr_name, graph)
                if old_colname != new_colname:
                    out_df = out_df.withColumnRenamed(old_colname, new_colname)

        return out_df