|
| 1 | +# Copyright 2017 Databricks, Inc. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | +# |
| 15 | +from __future__ import absolute_import, division, print_function |
| 16 | + |
| 17 | +import logging |
| 18 | +import tensorflow as tf |
| 19 | +from tensorflow.python.tools import optimize_for_inference_lib as infr_opt |
| 20 | +import tensorframes as tfs |
| 21 | + |
| 22 | +from pyspark.ml import Transformer |
| 23 | + |
| 24 | +import sparkdl.graph.utils as tfx |
| 25 | +from sparkdl.param import (keyword_only, HasInputMapping, HasOutputMapping, |
| 26 | + HasTFInputGraph, HasTFHParams) |
| 27 | + |
| 28 | +__all__ = ['TFTransformer'] |
| 29 | + |
| 30 | +logger = logging.getLogger('sparkdl') |
| 31 | + |
class TFTransformer(Transformer, HasTFInputGraph, HasTFHParams, HasInputMapping, HasOutputMapping):
    """
    Applies the TensorFlow graph to the array column in DataFrame.

    Restrictions of the current API:

    We assume that
    - All the inputs of the graphs have a "minibatch" dimension (i.e. an unknown leading
      dimension) in the tensor shapes.
    - Input DataFrame has an array column where all elements have the same length
    - The transformer is expected to work on blocks of data at the same time.
    """

    @keyword_only
    def __init__(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tfHParms=None):
        """
        __init__(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tfHParms=None)
        """
        super(TFTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tfHParms=None):
        """
        setParams(self, tfInputGraph=None, inputMapping=None, outputMapping=None, tfHParms=None)
        """
        # NOTE: intentionally no super().__init__() here. Re-running Params.__init__
        # would reset the param map, silently discarding any params set before a
        # later setParams call; base-class initialization belongs in __init__ only.
        kwargs = self._input_kwargs
        # Further canonicalization, e.g. converting dict to sorted str pairs, happens here
        return self._set(**kwargs)

    def _optimize_for_inference(self):
        """
        Optimize the stored input graph for inference.

        Derives the input/output node names from the input/output mappings and
        runs TensorFlow's ``optimize_for_inference`` pass over the GraphDef.

        :return: the optimized ``tf.GraphDef``
        """
        gin = self.getTFInputGraph()
        input_mapping = self.getInputMapping()
        output_mapping = self.getOutputMapping()
        # input_mapping pairs are (column_name, tensor_name);
        # output_mapping pairs are (tensor_name, column_name).
        input_node_names = [tfx.op_name(tnsr_name) for _, tnsr_name in input_mapping]
        output_node_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]

        # NOTE(phi-dbq): Spark DataFrame assumes float64 as default floating point type
        opt_gdef = infr_opt.optimize_for_inference(gin.graph_def,
                                                   input_node_names,
                                                   output_node_names,
                                                   # TODO: below is the place to change for
                                                   # the `float64` data type issue.
                                                   tf.float64.as_datatype_enum)
        return opt_gdef

    def _transform(self, dataset):
        """
        Apply the (inference-optimized) TensorFlow graph to ``dataset``.

        :param dataset: input Spark DataFrame; must contain the columns named
                        on the left side of the input mapping.
        :return: a DataFrame with one additional column per entry of the
                 output mapping, renamed according to that mapping.
        """
        graph_def = self._optimize_for_inference()
        input_mapping = self.getInputMapping()
        output_mapping = self.getOutputMapping()

        graph = tf.Graph()
        with tf.Session(graph=graph):
            # tensorframes needs the DataFrame's block structure analyzed first
            analyzed_df = tfs.analyze(dataset)

            out_tnsr_op_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]
            # Importing with name='' keeps the original node names so the
            # mapping-derived op names resolve inside `graph`.
            tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names)

            # Feed each input tensor (by op name) from its DataFrame column.
            feed_dict = dict((tfx.op_name(tnsr_name, graph), col_name)
                             for col_name, tnsr_name in input_mapping)
            fetches = [tfx.get_tensor(tnsr_op_name, graph) for tnsr_op_name in out_tnsr_op_names]

            out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)

            # tensorframes names output columns after the op; rename them to the
            # user-requested column names.
            for tnsr_name, new_colname in output_mapping:
                old_colname = tfx.op_name(tnsr_name, graph)
                if old_colname != new_colname:
                    out_df = out_df.withColumnRenamed(old_colname, new_colname)

        return out_df