diff --git a/.clang-format b/.clang-format
index 3963621..fe2e05b 100644
--- a/.clang-format
+++ b/.clang-format
@@ -6,24 +6,31 @@
 # The basic usage is,
 #   clang-format -i -style=file PATH/TO/SOURCE/CODE
 #
-# The -style=file implicit use ".clang-format" file located in one of 
-# parent directory. 
+# The -style=file option implicitly uses the ".clang-format" file located in
+# the nearest parent directory.
 # The -i means inplace change.
 #
-# The document of clang-format is 
+# The clang-format documentation is at
 #   http://clang.llvm.org/docs/ClangFormat.html
 #   http://clang.llvm.org/docs/ClangFormatStyleOptions.html
 ---
 Language:        Cpp
 BasedOnStyle:  Google
-IndentWidth:     2
-TabWidth:        2
+IndentWidth:     4
+TabWidth:        4
 ContinuationIndentWidth: 4
-AccessModifierOffset: -1  # The private/protected/public has no indent in class
-Standard:  Cpp11 
+AccessModifierOffset: -4  # The private/protected/public has no indent in class
+Standard:  Cpp11
 AllowAllParametersOfDeclarationOnNextLine: true
-BinPackParameters: false
+BinPackParameters: true
 BinPackArguments: false
-IncludeBlocks: Preserve
+ColumnLimit: 160
 IncludeIsMainSourceRegex: (\.cu)$
+IncludeCategories:
+  - Regex:           '^<.*\.h(pp)?>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
 ...
diff --git a/csrc/litert.cpp b/csrc/litert.cpp
index 439b22e..64556b2 100644
--- a/csrc/litert.cpp
+++ b/csrc/litert.cpp
@@ -39,12 +39,11 @@ static int32_t DIOPIRT_LOG_LEVEL = 0;
 
 static char szVersion[256] = {0};
 
-DIOPI_RT_API const char* diopiGetVersion()
-{
+DIOPI_RT_API const char* diopiGetVersion() {
     static bool inited = false;
     if (!inited) {
         inited = true;
-        sprintf(szVersion, "DIOPI Version: %d.%d.%d", DIOPI_VER_MAJOR, DIOPI_VER_MINOR, DIOPI_VER_PATCH);
+        snprintf(szVersion, sizeof(szVersion), "DIOPI Version: %d", DIOPI_VER_MAJOR * 1000 + DIOPI_VER_MINOR * 100 + DIOPI_VER_PATCH);
     }
     return szVersion;
 }
diff --git a/python/tests/test_stream.py b/python/tests/test_stream.py
index 70f6b23..5402271 100644
--- a/python/tests/test_stream.py
+++ b/python/tests/test_stream.py
@@ -1,108 +1,30 @@
 import numpy as np
-import time
 from threading import Thread
-from conformance.diopi_runtime import Context, Tensor, Sizes
-from conformance.utils import check_function, logger
-from ctypes import c_int32
+from conformance.diopi_runtime import Context, Tensor
+from conformance.diopi_runtime import Device
+from conformance.dtype import Dtype
 
 
 class TestStream(object):
-    # To do stream tests, the following workflow is used:
-    # begin = time.time()
-    # for i in range(nums):
-    #   y = mat1 @ mat2
-    #   mat1 = y
-    # Using Tensor.numpy() to sync stream, 'sum' is helpful to reduce the cost of memcpy
-    # res = sum(mat1)
-    # res_ndarray = Tensor.numpy(res)
-    # end = time.time()
     context = Context()
     context1 = Context()
     stream = context.get_handle()
     stream1 = context1.get_handle()
-    nums = 10
-    bmm_func = check_function("diopiMatmul")
-    sum_func = check_function("diopiSum")
 
-    @classmethod
-    def setup_class(self):
-        # generate numpy data
-        mat1_shape = (2, 32, 1024)
-        mat2_shape = (2, 1024, 1024)
-        self.mat1_ndarray = np.random.randn(*mat1_shape).astype(np.float32)
-        self.mat2_ndarray = np.random.randn(*mat2_shape).astype(np.float32)
-
-        out_ndarray = np.copy(self.mat1_ndarray)
-        for i in range(self.nums):
-            out_ndarray = np.matmul(out_ndarray, self.mat2_ndarray)
-        self.out_ref_ndarry = np.sum(out_ndarray)
-
-    def gen_device_data(self, stream):
-        # from_numpy call cudaMalloc which can not be concurrent with other missions on stream
-        mat1_tensor = Tensor.from_numpy(self.mat1_ndarray, context_handle=stream)
-        mat2_tensor = Tensor.from_numpy(self.mat2_ndarray, context_handle=stream)
-        out_tensor = Tensor.raw_like(mat1_tensor)
-        res_tensor = Tensor([], mat1_tensor.get_dtype(), context_handle=stream)
-        return mat1_tensor, mat2_tensor, out_tensor, res_tensor
-
-    def call_func(self, stream):
-        mat1, mat2, out, res = self.gen_device_data(stream)
-        # Allocate all the device memory in advance,
-        # so we can assure that stream will not be interrupted by device api like xxxmalloc()
-        begin = time.time()
-        for i in range(self.nums):
-            self.bmm_func(stream, out.tensor_handle, mat1.tensor_handle, mat2.tensor_handle)
-            tmp = out
-            out = mat1
-            mat1 = tmp
-
-        dim = Sizes((0, 1, 2))
-        dtype = res.get_dtype()
-        self.sum_func(stream, res.tensor_handle, mat1.tensor_handle, dim, c_int32(dtype.value))
-        out_ndarray = Tensor.numpy(res)
-        end = time.time()
-
-        assert np.allclose(out_ndarray, self.out_ref_ndarry, 1e-2, 1e-1, True)
-        return end - begin
+    def check_get_device_data(self, stream):
+        res_tensor = Tensor([2, 2], Dtype.float32, context_handle=stream)
+        assert res_tensor.get_device() == Device.AIChip
 
     def test_stream(self):
-        # warm up
-        cost = self.call_func(self.stream)
-        logger.info(f"warming-up costs: {cost}s")
+        self.check_get_device_data(self.stream)
 
     def test_multi_stream(self):
-        mat1, mat2, out, res = self.gen_device_data(self.stream)
-        mat1_s1, mat2_s1, out_s1, res_s1 = self.gen_device_data(self.stream1)
-
-        baseline = self.call_func(self.stream)
-
-        begin = time.time()
-        for i in range(self.nums):
-            self.bmm_func(self.stream, out.tensor_handle, mat1.tensor_handle, mat2.tensor_handle)
-            self.bmm_func(self.stream1, out_s1.tensor_handle, mat1_s1.tensor_handle, mat2_s1.tensor_handle)
-            tmp = out
-            tmp_s1 = out_s1
-            out = mat1
-            out_s1 = mat1_s1
-            mat1 = tmp
-            mat1_s1 = tmp_s1
-
-        dim1 = Sizes((0, 1, 2))
-        dtype = res.get_dtype()
-        self.sum_func(self.stream, res.tensor_handle, mat1.tensor_handle, dim1, c_int32(dtype.value))
-        self.sum_func(self.stream1, res_s1.tensor_handle, mat1_s1.tensor_handle, dim1, c_int32(dtype.value))
-        out_ndarray = Tensor.numpy(res)
-        out_s1_ndarray = Tensor.numpy(res_s1)
-        end = time.time()
-
-        logger.info(f"after warming-up, one stream costs: {baseline}s, two streams costs: {end - begin}s")
-        assert (end - begin) < 1.8 * baseline, "don't improve 20% performance by concurrent stream"
-        assert np.allclose(out_ndarray, self.out_ref_ndarry, 1e-2, 1e-1, True)
-        assert np.allclose(out_s1_ndarray, self.out_ref_ndarry, 1e-2, 1e-1, True)
+        self.check_get_device_data(self.stream)
+        self.check_get_device_data(self.stream1)
 
     def test_multi_thread_multi_stream(self):
-        thread_1 = Thread(target=self.call_func, args=(self.stream, ))
-        thread_2 = Thread(target=self.call_func, args=(self.stream1, ))
+        thread_1 = Thread(target=self.check_get_device_data, args=(self.stream,))
+        thread_2 = Thread(target=self.check_get_device_data, args=(self.stream1,))
         thread_1.start()
         thread_2.start()
         thread_1.join()