diff --git a/README-cn.md b/README-cn.md
index 5404fda..01ae8c6 100644
--- a/README-cn.md
+++ b/README-cn.md
@@ -85,7 +85,7 @@ cmake --build build-macos --target all
 #### 3. 运行示例应用程序
 
 ```bash
-./build/bin/system_test ../obj
+./build/bin/system_test ./obj
 ```
 
 ---
diff --git a/README.md b/README.md
index 95981fd..fab00dc 100755
--- a/README.md
+++ b/README.md
@@ -86,7 +86,7 @@ cmake --build build-macos --target all
 #### 3. Run the Example Application
 
 ```bash
-./build/bin/system_test ../obj
+./build/bin/system_test ./obj
 ```
 
 ---
diff --git a/src/include/face.hpp b/src/include/face.hpp
index 28a5b30..49f0754 100644
--- a/src/include/face.hpp
+++ b/src/include/face.hpp
@@ -40,7 +40,7 @@ class Face {
   // Get functions
   // 获取函数
   inline const std::array<size_t, 3>& GetIndices() const { return indices_; }
-  inline const size_t GetIndex(size_t index) const { return indices_[index]; }
+  inline size_t GetIndex(size_t index) const { return indices_[index]; }
   inline const Material& GetMaterial() const { return material_; }
 
  private:
diff --git a/src/include/log_system.h b/src/include/log_system.h
index a1f2903..2f8d9c4 100755
--- a/src/include/log_system.h
+++ b/src/include/log_system.h
@@ -17,6 +17,9 @@
 #ifndef SIMPLERENDER_SRC_INCLUDE_LOG_SYSTEM_H_
 #define SIMPLERENDER_SRC_INCLUDE_LOG_SYSTEM_H_
 
+#ifndef SPDLOG_ACTIVE_LEVEL
+#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
+#endif
 #include <spdlog/spdlog.h>
 
 namespace simple_renderer {
diff --git a/src/include/rasterizer.hpp b/src/include/rasterizer.hpp
index 749aa28..cd0b349 100644
--- a/src/include/rasterizer.hpp
+++ b/src/include/rasterizer.hpp
@@ -3,6 +3,7 @@
 
 #include "config.h"
 #include "shader.hpp"
+#include "vertex.hpp"
 
 namespace simple_renderer {
 
@@ -15,21 +16,80 @@ class Rasterizer {
   auto operator=(Rasterizer&& rasterizer) -> Rasterizer& = default;
   ~Rasterizer() = default;
 
+  /**
+   * @brief 构造具有指定尺寸的光栅化器
+   * @param width 光栅化器宽度
+   * @param height 光栅化器高度
+   */
   Rasterizer(size_t width, size_t height);
 
+  /**
+   * @brief 光栅化三角形，生成片段列表
+   * @param v0 三角形第一个顶点
+   * @param v1 三角形第二个顶点
+   * @param v2 三角形第三个顶点
+   * @return 生成的片段向量
+   */
   std::vector<Fragment> Rasterize(const Vertex& v0, const Vertex& v1,
                                   const Vertex& v2);
 
+  /**
+   * @brief Non-allocating variant: appends fragments to a caller-provided container
+   *
+   * The clip region is the half-open rectangle [x0, x1) x [y0, y1).
+   * Intended for TBR: restricts rasterization to a tile's bounds so an external scratch container can be reused
+   *
+   * @param v0 first vertex of the triangle
+   * @param v1 second vertex of the triangle
+   * @param v2 third vertex of the triangle
+   * @param x0 left edge of the clip region (inclusive)
+   * @param y0 top edge of the clip region (inclusive)
+   * @param x1 right edge of the clip region (exclusive)
+   * @param y1 bottom edge of the clip region (exclusive)
+   * @param out output fragment container
+   */
+  void RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2,
+                   int x0, int y0, int x1, int y1,
+                   std::vector<Fragment>& out);
+
+  /**
+   * @brief SoA variant: reads the triangle's three vertices from the SoA by vertex index
+   * @param soa vertex data in structure-of-arrays (SoA) layout
+   * @param i0 index of the triangle's first vertex
+   * @param i1 index of the triangle's second vertex
+   * @param i2 index of the triangle's third vertex
+   * @param x0 left edge of the clip region (inclusive)
+   * @param y0 top edge of the clip region (inclusive)
+   * @param x1 right edge of the clip region (exclusive)
+   * @param y1 bottom edge of the clip region (exclusive)
+   * @param out output fragment container
+   */
+  void RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2,
+                   int x0, int y0, int x1, int y1,
+                   std::vector<Fragment>& out);
+
  private:
   size_t width_, height_;
 
+  // Result of a perspective correction
+  struct PerspectiveCorrectionResult {
+    Vector3f corrected_barycentric;  // perspective-corrected barycentric weights
+    float interpolated_z;            // depth value returned together with the weights
+  };
+
+  // Perspective-correction helper: takes per-vertex w and z plus screen-space barycentric weights
+  PerspectiveCorrectionResult PerformPerspectiveCorrection(
+      float w0, float w1, float w2,
+      float z0, float z1, float z2,
+      const Vector3f& original_barycentric) const;
+
   template <typename T>
   T Interpolate(const T& v0, const T& v1, const T& v2,
-                const Vector3f& barycentric_coord);
+                const Vector3f& barycentric_coord) const;
 
   Color InterpolateColor(const Color& color0, const Color& color1,
                          const Color& color2,
-                         const Vector3f& barycentric_coord);
+                         const Vector3f& barycentric_coord) const;
 
   std::pair<bool, Vector3f> GetBarycentricCoord(const Vector3f& p0,
                                                 const Vector3f& p1,
diff --git a/src/include/renderer.h b/src/include/renderer.h
index bcc136f..e11c93f 100755
--- a/src/include/renderer.h
+++ b/src/include/renderer.h
@@ -18,57 +18,95 @@
 #define SIMPLERENDER_SRC_INCLUDE_RENDERER_H_
 
 #include <cstdint>
-#include <functional>
-#include <span>
+#include <memory>
+#include <string>
 
-#include "buffer.hpp"
-#include "light.h"
 #include "log_system.h"
-#include "math.hpp"
 #include "model.hpp"
-#include "rasterizer.hpp"
 #include "shader.hpp"
+#include "renderers/renderer_base.hpp"
 
 namespace simple_renderer {
 
+// Rendering mode enumeration
+/**
+ * @brief Rendering mode
+ * - PER_TRIANGLE: per-triangle (triangle-major) forward rendering
+ * - TILE_BASED: tile-based (tile-major) forward rendering
+ * - DEFERRED: deferred rendering (fragments are collected first, then shaded)
+ */
+enum class RenderingMode {
+  PER_TRIANGLE,  //!< per-triangle (triangle-major)
+  TILE_BASED,    //!< tile-based (tile-major)
+  DEFERRED       //!< deferred rendering
+};
+
+/**
+ * @brief Converts a rendering mode enumerator to a human-readable string
+ * @param mode rendering mode
+ * @return readable string (PER_TRIANGLE/TILE_BASED/DEFERRED)
+ */
+std::string RenderingModeToString(RenderingMode mode);
+
+/**
+ * @brief 渲染门面（Facade）
+ *
+ * 职责：
+ * - 仅作为模式选择与调用入口；
+ * - 根据 `RenderingMode` 构造并持有具体渲染器；
+ * - 对外暴露统一的 `DrawModel` 接口。
+ */
 class SimpleRenderer {
  public:
   /**
-   * 构造函数
-   * @param width
-   * @param height
-   * @param buffer 要进行绘制的内存区域，大小为 width*height*sizeof(uint32_t)
-   * @param
+   * @brief 构造渲染器门面
+   * @param width 画布宽度（像素）
+   * @param height 画布高度（像素）
    */
   SimpleRenderer(size_t width, size_t height);
+  ~SimpleRenderer() = default;
 
-  /// @name 默认构造/析构函数
-  /// @{
-  SimpleRenderer(const SimpleRenderer &_simplerenderer) = default;
-  SimpleRenderer(SimpleRenderer &&_simplerenderer) = default;
-  auto operator=(const SimpleRenderer &_simplerenderer) -> SimpleRenderer & =
-                                                               default;
-  auto operator=(SimpleRenderer &&_simplerenderer) -> SimpleRenderer & =
-                                                          default;
-  virtual ~SimpleRenderer() = default;
-  /// @}
+  /**
+   * @brief 绘制单个模型
+   * @param model 模型
+   * @param shader 着色器（含 uniform）
+   * @param buffer 输出颜色缓冲（width*height）
+   * @return 是否成功
+   */
+  bool DrawModel(const Model &model, const Shader &shader, uint32_t *buffer);
 
-  bool Render(const Model &model, const Shader &shader, uint32_t *buffer);
+  /**
+   * @brief 设置渲染模式
+   */
+  void SetRenderingMode(RenderingMode mode);
+  /**
+   * @brief 获取当前渲染模式
+   */
+  RenderingMode GetRenderingMode() const;
+
+  // 可选：配置参数（仅对 TileBasedRenderer 生效；运行中修改将重建 TBR 实例）
+  /**
+   * @brief 启用或禁用 Early‑Z（仅 TBR 有效）
+   */
+  void SetEarlyZEnabled(bool enabled);
+  /**
+   * @brief 设置 Tile 大小（仅 TBR 有效）
+   */
+  void SetTileSize(size_t tile_size);
+
+ private:
+  void EnsureRenderer();
 
  private:
   const size_t height_;
   const size_t width_;
   LogSystem log_system_;
+  RenderingMode current_mode_;
+  std::unique_ptr<RendererBase> renderer_;
 
-  std::shared_ptr<Shader> shader_;
-  std::shared_ptr<Rasterizer> rasterizer_;
-
-  /**
-   * 绘制模型
-   * @param model 模型
-   */
-  void DrawModel(const Model &model, uint32_t *buffer);
-  void DrawModelSlower(const Model &model, uint32_t *buffer);
+  // TBR 配置缓存：在创建 TileBasedRenderer 时下发
+  bool tbr_early_z_ = true;
+  size_t tbr_tile_size_ = 64;
 };
 }  // namespace simple_renderer
 
diff --git a/src/include/renderers/deferred_renderer.hpp b/src/include/renderers/deferred_renderer.hpp
new file mode 100644
index 0000000..245f5f8
--- /dev/null
+++ b/src/include/renderers/deferred_renderer.hpp
@@ -0,0 +1,31 @@
+#ifndef SIMPLERENDER_SRC_INCLUDE_RENDERERS_DEFERRED_RENDERER_HPP_
+#define SIMPLERENDER_SRC_INCLUDE_RENDERERS_DEFERRED_RENDERER_HPP_
+
+#include "renderers/renderer_base.hpp"
+
+namespace simple_renderer {
+
+/**
+ * @brief Deferred renderer
+ *
+ * Organizes the work to mimic how OpenGL runs on a GPU, i.e. a GPU-style pipeline.
+ * Compared with the two forward renderers, it uses more memory and renders more slowly.
+ *
+ * Characteristics:
+ * - AoS vertex path;
+ * - first collects all fragments per pixel and keeps the nearest depth;
+ * - then runs fragment shading on the kept fragments (a teaching-oriented
+ *   model of the classic GPU pipeline).
+ */
+class DeferredRenderer final : public RendererBase {
+ public:
+  using RendererBase::RendererBase;
+  /**
+   * @copydoc RendererBase::Render
+   */
+  bool Render(const Model& model, const Shader& shader, uint32_t* out_color) override;
+};
+
+}  // namespace simple_renderer
+
+#endif  // SIMPLERENDER_SRC_INCLUDE_RENDERERS_DEFERRED_RENDERER_HPP_
diff --git a/src/include/renderers/per_triangle_renderer.hpp b/src/include/renderers/per_triangle_renderer.hpp
new file mode 100644
index 0000000..e2cee62
--- /dev/null
+++ b/src/include/renderers/per_triangle_renderer.hpp
@@ -0,0 +1,28 @@
+#ifndef SIMPLERENDER_SRC_INCLUDE_RENDERERS_PER_TRIANGLE_RENDERER_HPP_
+#define SIMPLERENDER_SRC_INCLUDE_RENDERERS_PER_TRIANGLE_RENDERER_HPP_
+
+#include "renderers/renderer_base.hpp"
+
+namespace simple_renderer {
+
+/**
+ * @brief Per-triangle renderer (triangle-major)
+ *
+ * Characteristics:
+ * - AoS vertex path;
+ * - per-thread local framebuffers (depth/color) that are merged;
+ * - back-face culling is done in screen space;
+ * - close to a "traditional" stack-style forward-rendering teaching implementation.
+ */
+class PerTriangleRenderer final : public RendererBase {
+ public:
+  using RendererBase::RendererBase;
+  /**
+   * @copydoc RendererBase::Render
+   */
+  bool Render(const Model& model, const Shader& shader, uint32_t* out_color) override;
+};
+
+}  // namespace simple_renderer
+
+#endif  // SIMPLERENDER_SRC_INCLUDE_RENDERERS_PER_TRIANGLE_RENDERER_HPP_
diff --git a/src/include/renderers/renderer_base.hpp b/src/include/renderers/renderer_base.hpp
new file mode 100644
index 0000000..ad09ac7
--- /dev/null
+++ b/src/include/renderers/renderer_base.hpp
@@ -0,0 +1,66 @@
+// Renderer base class shared by all rendering strategies
+#ifndef SIMPLERENDER_SRC_INCLUDE_RENDERERS_RENDERER_BASE_HPP_
+#define SIMPLERENDER_SRC_INCLUDE_RENDERERS_RENDERER_BASE_HPP_
+
+#include <cstdint>
+#include <memory>
+
+#include "rasterizer.hpp"
+#include "vertex.hpp"
+#include "model.hpp"
+#include "shader.hpp"
+
+namespace simple_renderer {
+
+
+/**
+ * @brief Abstract base class for renderers
+ *
+ * Conventions:
+ * - Render performs the complete rendering process (vertex transform + rasterization + shading + writing the output buffer).
+ * - Subclasses choose different "organizational units" (the unit of parallel work): per triangle, per tile, or a deferred pipeline.
+ * - The shared perspective division and viewport transform are provided here; subclasses reuse them as needed.
+ */
+class RendererBase {
+ public:
+  RendererBase(size_t width, size_t height)
+      : width_(width), height_(height), rasterizer_(std::make_shared<Rasterizer>(width, height)) {}
+  virtual ~RendererBase() = default;
+
+  RendererBase(const RendererBase&) = delete;
+  RendererBase& operator=(const RendererBase&) = delete;
+
+  /**
+   * @brief Performs one render
+   * @param model model data
+   * @param shader shader (holds material/light/matrix uniforms)
+   * @param out_color output color buffer (size width*height)
+   * @return whether rendering succeeded
+   */
+  virtual bool Render(const Model& model, const Shader& shader, uint32_t* out_color) = 0;
+
+ protected:
+  /**
+   * @brief Perspective division: clip space -> NDC
+   * @param vertex clip-space vertex
+   * @return NDC vertex (documented as keeping 1/w for perspective correction; TODO confirm against the definition)
+   */
+  Vertex PerspectiveDivision(const Vertex& vertex);
+  /**
+   * @brief Viewport transform: NDC -> screen coordinates
+   * @param vertex NDC vertex
+   * @return screen-space vertex
+   */
+  Vertex ViewportTransformation(const Vertex& vertex);
+
+ protected:
+  size_t width_;
+  size_t height_;
+  std::shared_ptr<Rasterizer> rasterizer_;  // created in the constructor with the same width/height
+
+  static constexpr float kMinWValue = 1e-6f;  // presumably the smallest w accepted before dividing; TODO confirm
+};
+
+}  // namespace simple_renderer
+
+#endif  // SIMPLERENDER_SRC_INCLUDE_RENDERERS_RENDERER_BASE_HPP_
diff --git a/src/include/renderers/tile_based_renderer.hpp b/src/include/renderers/tile_based_renderer.hpp
new file mode 100644
index 0000000..da7970c
--- /dev/null
+++ b/src/include/renderers/tile_based_renderer.hpp
@@ -0,0 +1,130 @@
+#ifndef SIMPLERENDER_SRC_INCLUDE_RENDERERS_TILE_BASED_RENDERER_HPP_
+#define SIMPLERENDER_SRC_INCLUDE_RENDERERS_TILE_BASED_RENDERER_HPP_
+
+#include "renderers/renderer_base.hpp"
+
+namespace simple_renderer {
+
+/**
+ * @brief Lightweight reference to a triangle inside a tile (SoA indices + material pointer)
+ */
+struct TileTriangleRef {
+  size_t i0, i1, i2;  // the three vertex indices; note: no default initializer
+  const Material* material = nullptr;
+  size_t face_index = 0;
+};
+
+struct TileMaskStats {
+  uint64_t tested = 0; // total number of pixels visited and tested
+  uint64_t covered = 0; // pixels that passed the in-triangle coverage test (edge-function inside test succeeded)
+  uint64_t zpass = 0; // pixels that passed the early-z test (depth smaller than the tile-local depth buffer)
+  uint64_t shaded = 0; // pixels actually shaded and written back (passed the early-z or late-z test)
+};
+
+/**
+ * @brief Tile-grid context (grid/geometry information shared by binning and rasterization)
+ */
+struct TileGridContext {
+  const VertexSoA& soa;  // held by reference, not copied
+  size_t tiles_x;
+  size_t tiles_y;
+  size_t tile_size;
+};
+
+/**
+ * @brief Tile-based renderer (tile-major)
+ *
+ * Characteristics:
+ * - SoA vertex layout;
+ * - triangles are binned per tile, with a local Early-Z inside each tile;
+ * - a single global framebuffer; each tile's covered area is copied straight back into it;
+ * - behavior is controlled through the constructor parameters early_z and tile_size.
+ */
+class TileBasedRenderer final : public RendererBase {
+ public:
+  /**
+   * @brief Constructor
+   * @param width canvas width
+   * @param height canvas height
+   * @param early_z whether Early-Z is enabled (enabled by default)
+   * @param tile_size tile size in pixels (64 by default)
+   */
+  TileBasedRenderer(size_t width, size_t height, bool early_z = true, size_t tile_size = 64)
+      : RendererBase(width, height), early_z_(early_z), tile_size_(tile_size) {}
+  /**
+   * @copydoc RendererBase::Render
+   */
+  bool Render(const Model& model, const Shader& shader, uint32_t* out_color) override;
+
+ private:
+  /**
+   * @brief Maps triangles onto the tile grid by their screen-space bounding boxes
+   * @param model model (provides faces/materials)
+   * @param grid tile-grid context bundling the transformed SoA vertex data
+   *        (grid.soa), the horizontal/vertical tile counts (grid.tiles_x,
+   *        grid.tiles_y) and the tile size in pixels (grid.tile_size)
+   * @param tile_triangles output: list of triangle references for each tile
+   *        (NOTE(review): indexing scheme of the outer vector is not visible here)
+   */
+  void TriangleTileBinning(const Model& model,
+                           const TileGridContext& grid,
+                           std::vector<std::vector<TileTriangleRef>> &tile_triangles);
+
+  /**
+   * @brief Tile-binning logic for a single triangle
+   * @param tri_idx triangle index
+   * @param count_only whether to only count (true = counting mode, false = fill mode)
+   * @param model model data
+   * @param grid tile-grid context: transformed SoA vertex data (grid.soa),
+   *        horizontal/vertical tile counts (grid.tiles_x, grid.tiles_y) and
+   *        tile size in pixels (grid.tile_size)
+   * @param tile_counts per-tile counter array (used in counting mode)
+   * @param tile_triangles per-tile triangle reference lists (used in fill mode)
+   *        (mode usage as stated by the original note; definition not shown here)
+   */
+  void ProcessTriangleForTileBinning(
+      size_t tri_idx, bool count_only,
+      const Model& model,
+      const TileGridContext& grid,
+      std::vector<size_t>& tile_counts,
+      std::vector<std::vector<TileTriangleRef>>& tile_triangles);
+
+  /**
+   * @brief Rasterizes a single tile and writes the result back to the global framebuffer
+   * @param tile_id tile index
+   * @param triangles references to the triangles covering this tile
+   * @param grid tile-grid context: transformed SoA vertex data (grid.soa), tile
+   *        counts (grid.tiles_x, grid.tiles_y) and tile size in pixels
+   *        (grid.tile_size)
+   * @param tile_depth_buffer tile-local depth buffer (provided/reused by the caller)
+   * @param tile_color_buffer tile-local color buffer (provided/reused by the caller)
+   * @param global_depth_buffer global depth buffer (single copy)
+   * @param global_color_buffer global color buffer (single copy)
+   * @param shader shader
+   * @param use_early_z whether Early-Z is enabled
+   * @param scratch_fragments reusable temporary fragment container
+   * @param out_stats presumably receives the tile's TileMaskStats counters; TODO confirm (incl. whether null is allowed)
+   */
+  void RasterizeTile(size_t tile_id,
+                     const std::vector<TileTriangleRef> &triangles,
+                     const TileGridContext& grid,
+                     float* tile_depth_buffer, uint32_t* tile_color_buffer,
+                     std::unique_ptr<float[]> &global_depth_buffer,
+                     std::unique_ptr<uint32_t[]> &global_color_buffer,
+                     const Shader& shader,
+                     bool use_early_z,
+                     std::vector<Fragment>* scratch_fragments,
+                     TileMaskStats* out_stats);
+
+ private:
+  // Default depth and color values, used to initialize both the tile-level and the global buffers
+  static constexpr float kDepthClear = 1.0f; // defaults to the farthest value; used for Early-Z
+  static constexpr uint32_t kColorClear = 0u; // defaults to black
+
+  const bool early_z_;
+  const size_t tile_size_;
+};
+
+}  // namespace simple_renderer
+
+#endif  // SIMPLERENDER_SRC_INCLUDE_RENDERERS_TILE_BASED_RENDERER_HPP_
diff --git a/src/include/shader.hpp b/src/include/shader.hpp
index ed08998..8314f55 100644
--- a/src/include/shader.hpp
+++ b/src/include/shader.hpp
@@ -1,6 +1,10 @@
 #ifndef SIMPLERENDER_SRC_INCLUDE_SHADER_HPP_
 #define SIMPLERENDER_SRC_INCLUDE_SHADER_HPP_
 
+#include <array>
+#include <bit>
+#include <shared_mutex>
+#include <unordered_map>
 #include <variant>
 
 #include "light.h"
@@ -12,6 +16,8 @@ namespace simple_renderer {
 using UniformValue = std::variant<int, float, Vector2f, Vector3f, Vector4f,
                                   Matrix3f, Matrix4f, Material, Light>;
 
+inline constexpr size_t kSpecularLutResolution = 256;  // element count of SpecularLUT::values
+
 class UniformBuffer {
  public:
   template <typename T>
@@ -63,6 +69,32 @@ struct SharedDataInShader {
   Vector3f fragPos_varying = Vector3f(0.0f);
 };
 
+struct VertexUniformCache {  // matrix values cached for the vertex stage; NOTE(review): who fills them is not visible here
+  Matrix4f model = Matrix4f(1.0f);
+  Matrix4f view = Matrix4f(1.0f);
+  Matrix4f projection = Matrix4f(1.0f);
+  Matrix4f model_view = Matrix4f(1.0f);  // presumably derived from view and model; TODO confirm
+  Matrix4f mvp = Matrix4f(1.0f);         // presumably derived from projection, view and model; TODO confirm
+  Matrix3f normal = Matrix3f(1.0f);      // presumably the normal matrix; TODO confirm
+  bool has_model = false;
+  bool has_view = false;
+  bool has_projection = false;
+  bool derived_valid = false;  // presumably marks the derived matrices as up to date; TODO confirm
+};
+
+struct FragmentUniformCache {  // light/camera values cached for the fragment stage; NOTE(review): who fills them is not visible here
+  Light light{};
+  Vector3f camera_pos = Vector3f(0.0f);
+  Vector3f light_dir_normalized = Vector3f(0.0f);  // presumably the normalized light direction; TODO confirm
+  bool has_light = false;
+  bool has_camera = false;
+  bool derived_valid = false;  // presumably marks the derived values as up to date; TODO confirm
+};
+
+struct SpecularLUT {  // lookup table of kSpecularLutResolution floats
+  std::array<float, kSpecularLutResolution> values{};  // value-initialized
+};
+
 /**
  * @brief Shader Class 着色器类
  *
@@ -70,10 +102,10 @@ struct SharedDataInShader {
 class Shader {
  public:
   Shader() = default;
-  Shader(const Shader &shader) = default;
-  Shader(Shader &&shader) = default;
-  auto operator=(const Shader &shader) -> Shader & = default;
-  auto operator=(Shader &&shader) -> Shader & = default;
+  Shader(const Shader &shader);
+  Shader(Shader &&shader) noexcept;
+  auto operator=(const Shader &shader) -> Shader &;
+  auto operator=(Shader &&shader) noexcept -> Shader &;
   virtual ~Shader() = default;
 
   // Input Data -> Vertex Shader -> Screen Space Coordiante
@@ -85,8 +117,17 @@ class Shader {
   template <typename T>
   void SetUniform(const std::string &name, const T &value) {
     uniformbuffer_.SetUniform(name, value);
+    if constexpr (std::is_same_v<T, Matrix4f>) {
+      UpdateMatrixCache(name, value);
+    } else if constexpr (std::is_same_v<T, Light>) {
+      UpdateFragmentCache(name, value);
+    } else if constexpr (std::is_same_v<T, Vector3f>) {
+      UpdateFragmentCache(name, value);
+    }
   }
 
+  void PrepareUniformCaches();
+
  private:
   // UniformBuffer
   UniformBuffer uniformbuffer_;
@@ -94,6 +135,23 @@ class Shader {
   // Shared Variables
   // 共享变量
   SharedDataInShader sharedDataInShader_;
+  VertexUniformCache vertex_uniform_cache_;
+  FragmentUniformCache fragment_uniform_cache_;
+  mutable std::unordered_map<uint32_t, SpecularLUT> specular_lut_cache_;
+  mutable std::shared_mutex specular_cache_mutex_;
+
+  void UpdateMatrixCache(const std::string &name, const Matrix4f &value);
+  void UpdateFragmentCache(const std::string &name, const Light &value);
+  void UpdateFragmentCache(const std::string &name, const Vector3f &value);
+  void RecalculateDerivedMatrices();
+  void RecalculateFragmentDerived();
+  void PrepareVertexUniformCache();
+  void PrepareFragmentUniformCache();
+
+  // LUT相关
+  [[nodiscard]] auto BuildSpecularLUT(float shininess) const -> SpecularLUT;
+  [[nodiscard]] auto GetSpecularLUT(float shininess) const -> const SpecularLUT &;
+  [[nodiscard]] auto EvaluateSpecular(float cos_theta, float shininess) const -> float;
 
   Color SampleTexture(const Texture &texture, const Vector2f &uv) const;
   Color ClampColor(const Color color) const;
@@ -103,4 +161,4 @@ uint8_t FloatToUint8_t(float val);
 
 }  // namespace simple_renderer
 
-#endif /* SIMPLERENDER_SRC_INCLUDE_SHADER_H_ */
\ No newline at end of file
+#endif /* SIMPLERENDER_SRC_INCLUDE_SHADER_HPP_ */
diff --git a/src/include/vertex.hpp b/src/include/vertex.hpp
index 975abd0..b00f648 100644
--- a/src/include/vertex.hpp
+++ b/src/include/vertex.hpp
@@ -1,6 +1,9 @@
 #ifndef SIMPLERENDER_SRC_INCLUDE_VERTEX_HPP_
 #define SIMPLERENDER_SRC_INCLUDE_VERTEX_HPP_
 
+#include <vector>
+#include <optional>
+
 #include <math.hpp>
 
 #include "color.h"
@@ -31,10 +34,13 @@ class Vertex {
   // 析构函数
   ~Vertex() = default;
 
-  // Constructor with parameters 带参数的构造函数
+  // Constructor with parameters: optional clip space coordinate
+  // 带参数的构造函数：可选的裁剪空间坐标
   explicit Vertex(const Vector4f& pos, const Vector3f& norm,
-                  const Vector2f& tex, const Color& color_)
-      : position_(pos), normal_(norm), texCoords_(tex), color_(color_) {}
+                  const Vector2f& tex, const Color& color_,
+                  std::optional<Vector4f> clip_pos = std::nullopt)
+      : position_(pos), normal_(norm), texCoords_(tex), color_(color_),
+        clip_position_(clip_pos) {}
 
   // Transform the vertex with a matrix     使用矩阵变换顶点
   void transform(const Matrix4f& matrix) { position_ = matrix * position_; }
@@ -45,12 +51,19 @@ class Vertex {
   [[nodiscard]] inline Vector3f GetNormal() const { return normal_; }
   [[nodiscard]] inline Vector2f GetTexCoords() const { return texCoords_; }
   [[nodiscard]] inline Color GetColor() const { return color_; }
+  
+  // 扩展坐标访问
+  [[nodiscard]] inline std::optional<Vector4f> GetClipPosition() const { return clip_position_; }
+  [[nodiscard]] inline bool HasClipPosition() const { return clip_position_.has_value(); }
 
  private:
   Vector4f position_;   // 3D position, 3D顶点坐标
   Vector3f normal_;     // Normal vector, 顶点法向量
   Vector2f texCoords_;  // Texture coordinates, 顶点纹理坐标
   Color color_;
+  
+  // 扩展坐标用于裁剪优化
+  std::optional<Vector4f> clip_position_; // 裁剪空间坐标 (用于视锥体裁剪)
 };
 
 inline Vertex operator*(const Matrix4f& matrix, const Vertex& vertex) {
@@ -59,6 +72,26 @@ inline Vertex operator*(const Matrix4f& matrix, const Vertex& vertex) {
                 vertex.GetColor());
 }
 
+// Minimal SoA layout for TBR pipeline
+struct VertexSoA {
+  // Screen-space coordinates (after the viewport transform)
+  std::vector<Vector4f> pos_screen;  // screen space position (x,y,z,w)
+  // Clip-space coordinates (for frustum culling): clip = MVP * pos
+  std::vector<Vector4f> pos_clip;
+  std::vector<Vector3f> normal;
+  std::vector<Vector2f> uv;
+  std::vector<Color>    color;
+
+  inline size_t size() const { return pos_screen.size(); }  // reports the length of pos_screen only
+  inline void resize(size_t n) {  // resizes all five arrays to n
+    pos_screen.resize(n);
+    pos_clip.resize(n);
+    normal.resize(n);
+    uv.resize(n);
+    color.resize(n);
+  }
+};
+
 }  // namespace simple_renderer
 
 #endif
\ No newline at end of file
diff --git a/src/light.cpp b/src/light.cpp
index f25fb4c..ae3a51d 100644
--- a/src/light.cpp
+++ b/src/light.cpp
@@ -27,7 +27,7 @@ const Vector3f Light::kDefaultDir = Vector3f(0, 0, -1);
 const Color Light::kDefaultColor = Color::kWhite;
 
 Light::Light(const std::string &name) : name_(name) {
-  SPDLOG_INFO("Light: {}", name_);
+  SPDLOG_DEBUG("Light: {}", name_);
 }
 
 }  // namespace simple_renderer
diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp
index 8bf2d34..04aa6b1 100644
--- a/src/rasterizer.cpp
+++ b/src/rasterizer.cpp
@@ -1,12 +1,14 @@
 #include "rasterizer.hpp"
 
 #include <omp.h>
+#include <algorithm>
+#include <cmath>
 
 namespace simple_renderer {
 
 Rasterizer::Rasterizer(size_t width, size_t height)
     : width_(width), height_(height) {
-  SPDLOG_INFO("Rasterizer init with {}, {}", width, height);
+  SPDLOG_DEBUG("Rasterizer init with {}, {}", width, height);
 }
 
 std::vector<Fragment> Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1,
@@ -46,18 +48,25 @@ std::vector<Fragment> Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1,
         if (!is_inside) {
           continue;
         }
-        // 计算该点的深度，通过重心坐标插值计算
-        auto z = Interpolate(v0.GetPosition().z, v1.GetPosition().z,
-                             v2.GetPosition().z, barycentric_coord);
+
+        // 透视矫正插值
+        auto perspective_result = PerformPerspectiveCorrection(
+            v0.GetPosition().w, v1.GetPosition().w, v2.GetPosition().w,
+            v0.GetPosition().z, v1.GetPosition().z, v2.GetPosition().z,
+            barycentric_coord);
+        
+        const Vector3f& corrected_bary = perspective_result.corrected_barycentric;
+        float z = perspective_result.interpolated_z;
+
 
         Fragment fragment;
         fragment.screen_coord = {x, y};
-        fragment.normal = CalculateNormal(v0.GetPosition(), v1.GetPosition(),
-                                          v2.GetPosition());
+        fragment.normal = Interpolate(v0.GetNormal(), v1.GetNormal(),
+                                      v2.GetNormal(), corrected_bary);
         fragment.uv = Interpolate(v0.GetTexCoords(), v1.GetTexCoords(),
-                                  v2.GetTexCoords(), barycentric_coord);
+                                  v2.GetTexCoords(), corrected_bary);
         fragment.color = InterpolateColor(v0.GetColor(), v1.GetColor(),
-                                          v2.GetColor(), barycentric_coord);
+                                          v2.GetColor(), corrected_bary);
         fragment.depth = z;
 
         local_fragments.push_back(fragment);
@@ -72,6 +81,159 @@ std::vector<Fragment> Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1,
   return fragments;
 }
 
+void Rasterizer::RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2,
+                             int x0, int y0, int x1, int y1,
+                             std::vector<Fragment>& out) {
+  // Screen-space positions of the three vertices.
+  const Vector4f pos0 = v0.GetPosition();
+  const Vector4f pos1 = v1.GetPosition();
+  const Vector4f pos2 = v2.GetPosition();
+
+  // Corner points passed to GetBarycentricCoord for every candidate pixel.
+  const Vector3f corner0(pos0.x, pos0.y, pos0.z);
+  const Vector3f corner1(pos1.x, pos1.y, pos1.z);
+  const Vector3f corner2(pos2.x, pos2.y, pos2.z);
+
+  // Triangle bounding box, clamped to [0, width_ - 1] x [0, height_ - 1].
+  const float left   = std::max(0.0f, std::min({pos0.x, pos1.x, pos2.x}));
+  const float top    = std::max(0.0f, std::min({pos0.y, pos1.y, pos2.y}));
+  const float right  = std::min(float(width_ - 1), std::max({pos0.x, pos1.x, pos2.x}));
+  const float bottom = std::min(float(height_ - 1), std::max({pos0.y, pos1.y, pos2.y}));
+
+  // Intersect with the caller's half-open clip rectangle; the result is the
+  // closed integer range [first_x, last_x] x [first_y, last_y] to scan.
+  const int first_x = std::max(x0, static_cast<int>(std::floor(left)));
+  const int first_y = std::max(y0, static_cast<int>(std::floor(top)));
+  const int last_x  = std::min(x1 - 1, static_cast<int>(std::floor(right)));
+  const int last_y  = std::min(y1 - 1, static_cast<int>(std::floor(bottom)));
+  if (first_x > last_x || first_y > last_y) {
+    return;  // the scan range is empty
+  }
+
+  // x is the outer loop and y the inner one, so fragments are appended
+  // column by column.
+  for (int px = first_x; px <= last_x; ++px) {
+    for (int py = first_y; py <= last_y; ++py) {
+      const Vector3f sample(static_cast<float>(px), static_cast<float>(py), 0);
+      const auto [covered, weights] =
+          GetBarycentricCoord(corner0, corner1, corner2, sample);
+      if (!covered) {
+        continue;
+      }
+
+      // Perspective-correct the weights; depth comes out of the same helper.
+      const auto corrected = PerformPerspectiveCorrection(
+          pos0.w, pos1.w, pos2.w, pos0.z, pos1.z, pos2.z, weights);
+      const Vector3f& bary = corrected.corrected_barycentric;
+
+      Fragment fragment;  // the material pointer is filled in by the caller
+      fragment.screen_coord = {px, py};
+      fragment.normal = Interpolate(v0.GetNormal(), v1.GetNormal(), v2.GetNormal(), bary);
+      fragment.uv = Interpolate(v0.GetTexCoords(), v1.GetTexCoords(), v2.GetTexCoords(), bary);
+      fragment.color = InterpolateColor(v0.GetColor(), v1.GetColor(), v2.GetColor(), bary);
+      fragment.depth = corrected.interpolated_z;
+      out.push_back(fragment);
+    }
+  }
+}
+
+// SoA overload: rasterizes triangle (i0, i1, i2) of `soa` inside the half-open
+// clip rectangle [x0, x1) x [y0, y1) and appends one Fragment per covered
+// pixel to `out`, scanning row by row. Degenerate triangles (|2 * area| below
+// 1e-6) produce no fragments. The material pointer of each fragment is left
+// for the caller to fill in.
+void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2,
+                             int x0, int y0, int x1, int y1,
+                             std::vector<Fragment>& out) {
+  // Screen-space positions of the three vertices
+  const Vector4f& p0 = soa.pos_screen[i0];
+  const Vector4f& p1 = soa.pos_screen[i1];
+  const Vector4f& p2 = soa.pos_screen[i2];
+
+  // Screen-space AABB of the triangle, clamped to the raster area
+  const float minx_f = std::max(0.0f, std::min({p0.x, p1.x, p2.x}));
+  const float miny_f = std::max(0.0f, std::min({p0.y, p1.y, p2.y}));
+  const float maxx_f = std::min(float(width_  - 1), std::max({p0.x, p1.x, p2.x}));
+  const float maxy_f = std::min(float(height_ - 1), std::max({p0.y, p1.y, p2.y}));
+
+  // Intersect with the caller's clip region (half-open) -> closed scan range
+  int sx = std::max(x0, static_cast<int>(std::floor(minx_f)));
+  int sy = std::max(y0, static_cast<int>(std::floor(miny_f)));
+  int ex = std::min(x1 - 1, static_cast<int>(std::floor(maxx_f)));
+  int ey = std::min(y1 - 1, static_cast<int>(std::floor(maxy_f)));
+  if (sx > ex || sy > ey) return;
+
+  // Edge functions are evaluated relative to a triangle vertex instead of in
+  // the absolute form A*x + B*y + C: a large constant term C would cause
+  // floating-point cancellation and loss of significant digits.
+  auto cross2 = [](float ax, float ay, float bx, float by) {
+    return ax * by - ay * bx;
+  };
+  // Edge vectors
+  const float e01x = p1.x - p0.x, e01y = p1.y - p0.y; // (p0->p1)
+  const float e12x = p2.x - p1.x, e12y = p2.y - p1.y; // (p1->p2)
+  const float e20x = p0.x - p2.x, e20y = p0.y - p2.y; // (p2->p0)
+
+  // Signed (doubled) area in relative form: area2 = cross(p1 - p0, p2 - p0)
+  float area2 = cross2(e01x, e01y, p2.x - p0.x, p2.y - p0.y);
+  if (std::abs(area2) < 1e-6f) return; // degenerate triangle
+  const float inv_area2 = 1.0f / area2;
+  const bool positive = (area2 > 0.0f);
+
+  // Row-major traversal (y outer, x inner): friendlier to the cache.
+  // Deliberately no "#pragma omp simd" here; see the note inside the loop.
+  for (int y = sy; y <= ey; ++y) {
+    const float yf = static_cast<float>(y);
+
+    // out.push_back below is an order-dependent side effect (and may
+    // reallocate), so neither loop may be force-vectorized with omp simd; doing
+    // so risks wrong output such as periodic artifacts. Both loops stay scalar.
+    for (int x = sx; x <= ex; ++x) {
+      const float xf = static_cast<float>(x);
+
+      // Relative-coordinate edge functions:
+      // E01(p) = cross(p1 - p0, p - p0)
+      // E12(p) = cross(p2 - p1, p - p1)
+      // E20(p) = cross(p0 - p2, p - p2)
+      const float E01 = cross2(e01x, e01y, xf - p0.x, yf - p0.y);
+      const float E12 = cross2(e12x, e12y, xf - p1.x, yf - p1.y);
+      const float E20 = cross2(e20x, e20y, xf - p2.x, yf - p2.y);
+
+      // Half-space test (sign chosen by the triangle's winding)
+      const bool inside = positive ? (E01 >= 0.0f && E12 >= 0.0f && E20 >= 0.0f)
+                                   : (E01 <= 0.0f && E12 <= 0.0f && E20 <= 0.0f);
+      if (!inside) continue;
+
+      // Barycentric weight mapping:
+      // b0 belongs to v0 and uses the sub-area opposite edge (v1,v2) -> E12
+      // b1 belongs to v1 -> E20
+      // b2 belongs to v2 -> E01
+      const float b0 = E12 * inv_area2;
+      const float b1 = E20 * inv_area2;
+      const float b2 = E01 * inv_area2;
+      const Vector3f bary(b0, b1, b2);
+
+      // Perspective-corrected interpolation
+      auto perspective_result = PerformPerspectiveCorrection(
+          p0.w, p1.w, p2.w,
+          p0.z, p1.z, p2.z,
+          bary);
+
+      const Vector3f& corrected_bary = perspective_result.corrected_barycentric;
+      const float z = perspective_result.interpolated_z;
+
+      Fragment frag; // Note: the material pointer is filled in by the caller
+      frag.screen_coord = {x, y};
+      frag.normal = Interpolate(soa.normal[i0], soa.normal[i1], soa.normal[i2], corrected_bary);
+      frag.uv     = Interpolate(soa.uv[i0],     soa.uv[i1],     soa.uv[i2],     corrected_bary);
+      frag.color  = InterpolateColor(soa.color[i0], soa.color[i1], soa.color[i2], corrected_bary);
+      frag.depth  = z;
+
+      out.push_back(frag);
+    }
+  }
+}
+
 std::pair<bool, Vector3f> Rasterizer::GetBarycentricCoord(const Vector3f& p0,
                                                           const Vector3f& p1,
                                                           const Vector3f& p2,
@@ -98,17 +260,17 @@ std::pair<bool, Vector3f> Rasterizer::GetBarycentricCoord(const Vector3f& p0,
 
   return std::pair<bool, const Vector3f>{true, Vector3f(x, y, z)};
 }
-
+ 
 template <typename T>
 T Rasterizer::Interpolate(const T& v0, const T& v1, const T& v2,
-                          const Vector3f& barycentric_coord) {
+                          const Vector3f& barycentric_coord) const {
   return v0 * barycentric_coord.x + v1 * barycentric_coord.y +
          v2 * barycentric_coord.z;
 }
 
 Color Rasterizer::InterpolateColor(const Color& color0, const Color& color1,
                                    const Color& color2,
-                                   const Vector3f& barycentric_coord) {
+                                   const Vector3f& barycentric_coord) const {
   auto color_r = FloatToUint8_t(
       static_cast<float>(color0[Color::kColorIndexRed]) * barycentric_coord.x +
       static_cast<float>(color1[Color::kColorIndexRed]) * barycentric_coord.y +
@@ -127,6 +289,31 @@ Color Rasterizer::InterpolateColor(const Color& color0, const Color& color1,
   return Color(color_r, color_g, color_b);
 }
 
+// Perspective-correction helper: under perspective projection 1/w is linear in screen space, so 1/w is interpolated first and the result is used to correct the barycentric weights. Returns {corrected weights, depth interpolated with them}.
+Rasterizer::PerspectiveCorrectionResult Rasterizer::PerformPerspectiveCorrection(
+    float w0, float w1, float w2,
+    float z0, float z1, float z2,
+    const Vector3f& original_barycentric) const {
+    
+  // 1. Interpolate 1/w. w0, w1, w2 are treated as raw w values and inverted here (no guard against w == 0). NOTE(review): RendererBase::PerspectiveDivision stores 1/w in position.w - confirm callers do not pass that reciprocal.
+  float w0_inv = 1.0f / w0;
+  float w1_inv = 1.0f / w1;
+  float w2_inv = 1.0f / w2;
+  float w_inv_interpolated = Interpolate(w0_inv, w1_inv, w2_inv, original_barycentric);
+  
+  // 2. Compute the perspective-corrected barycentric weights: b_i * (1/w_i) / interpolated(1/w)
+  Vector3f corrected_barycentric(
+      original_barycentric.x * w0_inv / w_inv_interpolated,
+      original_barycentric.y * w1_inv / w_inv_interpolated,
+      original_barycentric.z * w2_inv / w_inv_interpolated
+  );
+  
+  // 3. Interpolate the depth value with the corrected weights
+  float interpolated_z = Interpolate(z0, z1, z2, corrected_barycentric);
+  
+  return {corrected_barycentric, interpolated_z};
+}
+
 // Calculate the normal vector based on the vertices
 // 根据顶点计算法向量
 Vector3f Rasterizer::CalculateNormal(const Vector3f& v0, const Vector3f& v1,
@@ -139,4 +326,4 @@ Vector3f Rasterizer::CalculateNormal(const Vector3f& v0, const Vector3f& v1,
       glm::cross(edge1, edge2));
 }
 
-}  // namespace simple_renderer
\ No newline at end of file
+}  // namespace simple_renderer
diff --git a/src/renderer.cpp b/src/renderer.cpp
old mode 100755
new mode 100644
index c7a5769..0939cf5
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -1,277 +1,83 @@
-
-/**
- * @file simple_renderer.cpp
- * @brief SimpleRenderer 实现
- * @author Zone.N (Zone.Niuzh@hotmail.com)
- * @version 1.0
- * @date 2023-10-23
- * @copyright MIT LICENSE
- * https://github.com/Simple-XX/SimpleRenderer
- * @par change log:
- * <table>
- * <tr><th>Date<th>Author<th>Description
- * <tr><td>2023-10-23<td>Zone.N<td>创建文件
- * </table>
- */
-
 #include "renderer.h"
 
-#include <omp.h>
-
-#include <array>
-#include <cstdint>
-#include <limits>
-#include <span>
-#include <string_view>
-#include <vector>
+#include <string>
 
 #include "config.h"
-#include "light.h"
-#include "log_system.h"
-#include "model.hpp"
+#include "renderers/per_triangle_renderer.hpp"
+#include "renderers/tile_based_renderer.hpp"
+#include "renderers/deferred_renderer.hpp"
 
 namespace simple_renderer {
 
+std::string RenderingModeToString(RenderingMode mode) {  // Returns the enumerator name of `mode` as text.
+  switch(mode) {
+    case RenderingMode::PER_TRIANGLE: return "PER_TRIANGLE";
+    case RenderingMode::TILE_BASED:  return "TILE_BASED";
+    case RenderingMode::DEFERRED:    return "DEFERRED";
+  }
+  return "PER_TRIANGLE";  // reached only when `mode` matches none of the cases above
+}
+
 SimpleRenderer::SimpleRenderer(size_t width, size_t height)
     : height_(height),
       width_(width),
-      log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)) {
-  rasterizer_ = std::make_shared<Rasterizer>(width, height);
+      log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)),
+      current_mode_(RenderingMode::TILE_BASED) {  // tile-based is the initial rendering mode
+  tbr_early_z_ = true;   // both tile-based options are set before EnsureRenderer() passes them to TileBasedRenderer
+  tbr_tile_size_ = 64;
+  EnsureRenderer();  // create the renderer for current_mode_ right away
 }
 
-bool SimpleRenderer::Render(const Model &model, const Shader &shader,
-                            uint32_t *buffer) {
-  SPDLOG_INFO("render model: {}", model.GetModelPath());
-  shader_ = std::make_shared<Shader>(shader);
-  DrawModel(model, buffer);
-  return true;
+bool SimpleRenderer::DrawModel(const Model &model, const Shader &shader, uint32_t *buffer) {  // Forwards to the active renderer's Render() and returns its result.
+  EnsureRenderer(); // make sure a renderer instance exists
+  SPDLOG_DEBUG("draw model: {}", model.GetModelPath());
+  return renderer_->Render(model, shader, buffer);
 }
 
-/*
-Optimizes performance by performing depth testing during rasterization, keeping
-only the closest fragment per pixel, and avoiding storing all
-fragments—resulting in faster rendering.
-
-通过在光栅化过程中执行深度测试，仅保留每个像素的深度值最近的片段，避免存储所有片段，从而优化性能，实现更快的渲染。
-*/
-void SimpleRenderer::DrawModel(const Model &model, uint32_t *buffer) {
-  SPDLOG_INFO("draw {}", model.GetModelPath());
-
-  /* * * Vertex Shader * * */
-  std::vector<Vertex> processedVertices;
-  std::vector<std::vector<Vertex>> processed_vertices_all_thread(kNProc);
-#pragma omp parallel num_threads(kNProc) default(none) \
-    shared(shader_, processed_vertices_all_thread) firstprivate(model)
-  {
-    int thread_id = omp_get_thread_num();
-    std::vector<Vertex> &processedVertices_per_thread =
-        processed_vertices_all_thread[thread_id];
-
-#pragma omp for
-    for (const auto &v : model.GetVertices()) {
-      auto vertex = shader_->VertexShader(v);
-      processedVertices_per_thread.push_back(vertex);
-    }
-  }
-
-  for (const auto &processedVertices_per_thread :
-       processed_vertices_all_thread) {
-    processedVertices.insert(processedVertices.end(),
-                             processedVertices_per_thread.begin(),
-                             processedVertices_per_thread.end());
-  }
-  /*  *  *  *  *  *  *  */
-
-  /* * * Rasterization * * */
-  std::vector<std::unique_ptr<float[]>> depthBuffer_all_thread(kNProc);
-  std::vector<std::unique_ptr<uint32_t[]>> colorBuffer_all_thread(kNProc);
-
-  for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
-    depthBuffer_all_thread[thread_id] =
-        std::make_unique<float[]>(width_ * height_);
-    colorBuffer_all_thread[thread_id] =
-        std::make_unique<uint32_t[]>(width_ * height_);
-
-    std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_,
-                std::numeric_limits<float>::infinity());
-    std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0);
-  }
-
-#pragma omp parallel num_threads(kNProc) default(none) \ 
-  shared(processedVertices, rasterizer_, shader_, width_, height_, \
-             depthBuffer_all_thread, colorBuffer_all_thread)       \
-    firstprivate(model)
-  {
-    int thread_id = omp_get_thread_num();
-    auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id];
-    auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id];
-#pragma omp for
-    for (const auto &f : model.GetFaces()) {
-      auto v0 = processedVertices[f.GetIndex(0)];
-      auto v1 = processedVertices[f.GetIndex(1)];
-      auto v2 = processedVertices[f.GetIndex(2)];
-
-      const Material *material = &f.GetMaterial();
-
-      auto fragments = rasterizer_->Rasterize(v0, v1, v2);
-
-      for (auto &fragment : fragments) {
-        fragment.material = material;
-
-        size_t x = fragment.screen_coord[0];
-        size_t y = fragment.screen_coord[1];
-
-        if (x >= width_ || y >= height_) {
-          continue;
-        }
-
-        size_t index = x + y * width_;
-
-        if (fragment.depth < depthBuffer_per_thread[index]) {
-          depthBuffer_per_thread[index] = fragment.depth;
-
-          /* * * Fragment Shader * * */
-          auto color = shader_->FragmentShader(fragment);
-          colorBuffer_per_thread[index] = uint32_t(color);
-        }
-      }
-    }
-  }
-
-  // Merge
-  std::unique_ptr<float[]> depthBuffer =
-      std::make_unique<float[]>(width_ * height_);
-  std::unique_ptr<uint32_t[]> colorBuffer =
-      std::make_unique<uint32_t[]>(width_ * height_);
-
-  std::fill_n(depthBuffer.get(), width_ * height_,
-              std::numeric_limits<float>::infinity());
-  std::fill_n(colorBuffer.get(), width_ * height_, 0);
-
-#pragma omp parallel for
-  for (size_t i = 0; i < width_ * height_; i++) {
-    float min_depth = std::numeric_limits<float>::infinity();
-    uint32_t color = 0;
-
-    for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
-      float depth = depthBuffer_all_thread[thread_id][i];
-      if (depth < min_depth) {
-        min_depth = depth;
-        color = colorBuffer_all_thread[thread_id][i];
-      }
-    }
-    depthBuffer[i] = min_depth;
-    colorBuffer[i] = color;
-  }
-
-  std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t));
+void SimpleRenderer::SetRenderingMode(RenderingMode mode) {  // Switches the rendering mode and rebuilds the renderer.
+  current_mode_ = mode;
+  SPDLOG_INFO("rendering mode set to: {}", RenderingModeToString(mode));
+  renderer_.reset();  // drop the old instance unconditionally, even if `mode` equals the previous mode
+  EnsureRenderer();   // construct the renderer for the new mode
 }
 
-/*
-Organizes processing to simulate how OpenGL works with GPUs by collecting all
-fragments per pixel before processing, closely mimicking the GPU pipeline but
-leading to increased memory usage and slower performance.
-
-组织处理方式模拟 OpenGL 在 GPU
-上的工作原理，先收集每个像素的所有片段再并行处理屏幕上的每个像素，模仿 GPU
-管线，但导致内存使用增加和渲染速度变慢
-*/
-void SimpleRenderer::DrawModelSlower(const Model &model, uint32_t *buffer) {
-  SPDLOG_INFO("draw {}", model.GetModelPath());
-
-  /* * * Vertex Shader * * */
-  std::vector<Vertex> processedVertex;
-  std::vector<std::vector<Vertex>> processed_vertices_per_thread(kNProc);
-#pragma omp parallel num_threads(kNProc) default(none) \
-    shared(shader_, processed_vertices_per_thread) firstprivate(model)
-  {
-    int thread_id = omp_get_thread_num();
-    std::vector<Vertex> &local_vertices =
-        processed_vertices_per_thread[thread_id];
-
-#pragma omp for
-    for (const auto &v : model.GetVertices()) {
-      /* * * Vertex Shader * *  */
-      auto vertex = shader_->VertexShader(v);
-      local_vertices.push_back(vertex);
-    }
-  }
-
-  for (const auto &local_vertices : processed_vertices_per_thread) {
-    processedVertex.insert(processedVertex.end(), local_vertices.begin(),
-                           local_vertices.end());
-  }
-  /*  *  *  *  *  *  *  */
-
-  /* * * Rasterization * * */
-  std::vector<std::vector<std::vector<Fragment>>> fragmentsBuffer_all_thread(
-      kNProc, std::vector<std::vector<Fragment>>(width_ * height_));
-
-#pragma omp parallel num_threads(kNProc) default(none)                       \
-    shared(processedVertex, fragmentsBuffer_all_thread, rasterizer_, width_, \
-               height_) firstprivate(model)
-  {
-    int thread_id = omp_get_thread_num();
-    auto &fragmentsBuffer_per_thread = fragmentsBuffer_all_thread[thread_id];
-
-#pragma omp for
-    for (const auto &f : model.GetFaces()) {
-      auto v0 = processedVertex[f.GetIndex(0)];
-      auto v1 = processedVertex[f.GetIndex(1)];
-      auto v2 = processedVertex[f.GetIndex(2)];
-
-      const Material *material = &f.GetMaterial();
-
-      auto fragments = rasterizer_->Rasterize(v0, v1, v2);
-
-      for (auto &fragment : fragments) {
-        fragment.material = material;
-
-        size_t x = fragment.screen_coord[0];
-        size_t y = fragment.screen_coord[1];
+RenderingMode SimpleRenderer::GetRenderingMode() const { return current_mode_; }  // Returns the currently selected rendering mode.
 
-        if (x >= width_ || y >= height_) {
-          continue;
-        }
-
-        size_t index = x + y * width_;
-        fragmentsBuffer_per_thread[index].push_back(fragment);
-      }
-    }
+void SimpleRenderer::SetEarlyZEnabled(bool enabled) {  // Stores the early-Z flag handed to TileBasedRenderer.
+  tbr_early_z_ = enabled;  // stored regardless of the active mode; read whenever a TileBasedRenderer is constructed
+  if (current_mode_ == RenderingMode::TILE_BASED) {  // only the tile-based renderer receives this flag, so only it is rebuilt
+    renderer_.reset();
+    EnsureRenderer();
   }
+}
 
-  // Merge fragments
-  std::vector<std::vector<Fragment>> fragmentsBuffer(width_ * height_);
-  for (const auto &fragmentsBuffer_per_thread : fragmentsBuffer_all_thread) {
-    for (size_t i = 0; i < fragmentsBuffer_per_thread.size(); i++) {
-      fragmentsBuffer[i].insert(fragmentsBuffer[i].end(),
-                                fragmentsBuffer_per_thread[i].begin(),
-                                fragmentsBuffer_per_thread[i].end());
-    }
+void SimpleRenderer::SetTileSize(size_t tile_size) {  // Stores the tile size handed to TileBasedRenderer.
+  tbr_tile_size_ = tile_size;  // not validated here; TileBasedRenderer::Render falls back to 64 when its tile_size_ is 0 (presumably this value - confirm)
+  if (current_mode_ == RenderingMode::TILE_BASED) {  // only the tile-based renderer receives this value, so only it is rebuilt
+    renderer_.reset();
+    EnsureRenderer();
   }
-/*  *  *  *  *  *  *  */
+}
 
-/* * * Fragment Shader * * */
-#pragma omp parallel for
-  for (size_t i = 0; i < fragmentsBuffer.size(); i++) {
-    const auto &fragments = fragmentsBuffer[i];
-    if (fragments.empty()) {
-      continue;
+void SimpleRenderer::EnsureRenderer() {  // Creates renderer_ for current_mode_ if it does not exist yet.
+  if (renderer_) return;  // already constructed: nothing to do
+  switch (current_mode_) { // lazy initialisation: create the instance matching the mode (no default case, so renderer_ stays null for any other value)
+    case RenderingMode::PER_TRIANGLE: {
+      auto r = std::make_unique<PerTriangleRenderer>(width_, height_);
+      renderer_ = std::move(r);
+      break;
     }
-
-    const Fragment *renderFragment = nullptr;
-    for (const auto &fragment : fragments) {
-      if (!renderFragment || fragment.depth < renderFragment->depth) {
-        renderFragment = &fragment;
-      }
+    case RenderingMode::TILE_BASED: {
+      auto r = std::make_unique<TileBasedRenderer>(width_, height_, tbr_early_z_, tbr_tile_size_);  // the only renderer that takes the early-Z flag and tile size
+      renderer_ = std::move(r);
+      break;
     }
-
-    if (renderFragment) {
-      auto color = shader_->FragmentShader(*renderFragment);
-      buffer[i] = uint32_t(color);
+    case RenderingMode::DEFERRED: {
+      auto r = std::make_unique<DeferredRenderer>(width_, height_);
+      renderer_ = std::move(r);
+      break;
     }
   }
-  /*  *  *  *  *  *  *  */
 }
 
 }  // namespace simple_renderer
diff --git a/src/renderers/deferred_renderer.cpp b/src/renderers/deferred_renderer.cpp
new file mode 100644
index 0000000..523fe20
--- /dev/null
+++ b/src/renderers/deferred_renderer.cpp
@@ -0,0 +1,177 @@
+#include "renderers/deferred_renderer.hpp"
+
+#include <omp.h>
+#include <algorithm>
+#include <chrono>
+#include <cassert>
+#include <iterator>
+
+#include "config.h"
+#include "log_system.h"
+
+namespace simple_renderer {
+
+bool DeferredRenderer::Render(const Model& model, const Shader& shader_in, uint32_t* buffer) {  // Stages: vertex transform -> per-pixel fragment lists -> nearest-fragment selection -> shading into `buffer`. Always returns true.
+  auto total_start_time = std::chrono::high_resolution_clock::now();
+  auto shader = std::make_shared<Shader>(shader_in);  // private copy of the caller's shader
+  shader->PrepareUniformCaches();
+
+  // Vertex transform (AoS): vertex shader -> perspective division -> viewport transform
+  auto vertex_start = std::chrono::high_resolution_clock::now();
+  const auto &input_vertices = model.GetVertices();
+  std::vector<Vertex> processedVertices(input_vertices.size());
+#pragma omp parallel for num_threads(kNProc) schedule(static) \
+    shared(shader, processedVertices, input_vertices)
+  for (size_t i = 0; i < input_vertices.size(); ++i) {
+    const auto &v = input_vertices[i];
+    auto clipSpaceVertex = shader->VertexShader(v);
+    auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
+    auto screenSpaceVertex = ViewportTransformation(ndcVertex);
+    processedVertices[i] = screenSpaceVertex;  // each iteration writes only its own slot
+  }
+  auto vertex_end = std::chrono::high_resolution_clock::now();
+  auto vertex_ms = std::chrono::duration_cast<std::chrono::microseconds>(vertex_end - vertex_start).count() / 1000.0;
+
+  // Buffer allocation: one fragment list per pixel for each of the kNProc threads
+  auto buffer_alloc_start = std::chrono::high_resolution_clock::now();
+  std::vector<std::vector<std::vector<Fragment>>> fragmentsBuffer_all_thread(
+      kNProc, std::vector<std::vector<Fragment>>(width_ * height_));
+
+  std::vector<Material> material_cache;  // one Material copy per face, in face order
+  material_cache.reserve(model.GetFaces().size());
+  for (const auto &f : model.GetFaces()) {
+    material_cache.emplace_back(f.GetMaterial());
+  }
+  auto buffer_alloc_end = std::chrono::high_resolution_clock::now();
+  auto buffer_alloc_ms = std::chrono::duration_cast<std::chrono::microseconds>(buffer_alloc_end - buffer_alloc_start).count() / 1000.0;
+
+  // Rasterization: collect fragments per pixel per thread
+  auto raster_start = std::chrono::high_resolution_clock::now();
+#pragma omp parallel num_threads(kNProc) default(none)                       \
+  shared(processedVertices, fragmentsBuffer_all_thread, rasterizer_, width_, \
+               height_, material_cache, model)
+  {
+    int thread_id = omp_get_thread_num();
+    auto &fragmentsBuffer_per_thread = fragmentsBuffer_all_thread[thread_id];  // each thread appends only to its own buffer
+
+#pragma omp for
+    for (size_t face_idx = 0; face_idx < model.GetFaces().size(); ++face_idx) {
+      const auto &f = model.GetFaces()[face_idx];
+      auto v0 = processedVertices[f.GetIndex(0)];
+      auto v1 = processedVertices[f.GetIndex(1)];
+      auto v2 = processedVertices[f.GetIndex(2)];
+
+      const Material *material = &material_cache[face_idx]; // use the cached Material copy
+      auto fragments = rasterizer_->Rasterize(v0, v1, v2);
+
+      for (auto &fragment : fragments) {
+        fragment.material = material;
+        size_t x = fragment.screen_coord[0];
+        size_t y = fragment.screen_coord[1];
+
+        if (x >= width_ || y >= height_) continue;  // drop fragments outside the framebuffer
+        size_t index = x + y * width_;
+        fragmentsBuffer_per_thread[index].push_back(fragment);
+      }
+    }
+  }
+  auto raster_end = std::chrono::high_resolution_clock::now();
+  auto raster_ms = std::chrono::duration_cast<std::chrono::microseconds>(raster_end - raster_start).count() / 1000.0;
+
+  /* * * Fragment Collection * * */
+  auto collect_start = std::chrono::high_resolution_clock::now();
+
+  const size_t pixel_count = static_cast<size_t>(width_) * static_cast<size_t>(height_);
+
+#ifndef NDEBUG
+  for (const auto &tb : fragmentsBuffer_all_thread) {
+    // Assert that every per-thread buffer has exactly one bucket per pixel, so the indexing below stays in bounds
+    assert(tb.size() == pixel_count && "thread buffer size mismatch");
+  }
+#endif
+
+  // Pass 1: count the total number of fragments in each pixel bucket
+  std::vector<size_t> bucket_total(pixel_count, 0);
+  for (const auto &tb : fragmentsBuffer_all_thread) {
+    for (size_t i = 0; i < pixel_count; ++i) {
+      bucket_total[i] += tb[i].size();
+    }
+  }
+
+  // Pass 2: reserve every merged bucket once, up front
+  std::vector<std::vector<Fragment>> fragmentsBuffer(pixel_count);
+  for (size_t i = 0; i < pixel_count; ++i) {
+    if (bucket_total[i] > 0) fragmentsBuffer[i].reserve(bucket_total[i]);
+  }
+
+  // Pass 3: merge bucket by bucket in parallel (inside a bucket, fragments keep thread-index insertion order)
+#pragma omp parallel for num_threads(kNProc) schedule(static)
+  for (long long i = 0; i < static_cast<long long>(pixel_count); ++i) {
+    auto &dst = fragmentsBuffer[static_cast<size_t>(i)];
+    for (size_t t = 0; t < fragmentsBuffer_all_thread.size(); ++t) {
+      auto &src = fragmentsBuffer_all_thread[t][static_cast<size_t>(i)];
+      dst.insert(dst.end(),
+                 std::make_move_iterator(src.begin()),
+                 std::make_move_iterator(src.end()));
+      src.clear();
+    }
+  }
+  auto collect_end = std::chrono::high_resolution_clock::now();
+  auto collect_ms = std::chrono::duration_cast<std::chrono::microseconds>(collect_end - collect_start).count() / 1000.0;
+
+  /* * * Fragment Merge & Deferred Shading * * */
+  auto merge_start = std::chrono::high_resolution_clock::now();
+
+  // Fragment merge stage: depth test picks the fragment with the smallest depth for each pixel
+  std::vector<const Fragment*> selected_fragments(width_ * height_, nullptr);
+#pragma omp parallel for
+  for (size_t i = 0; i < fragmentsBuffer.size(); i++) {
+    const auto &fragments = fragmentsBuffer[i];
+    if (fragments.empty()) continue;
+    const Fragment *renderFragment = nullptr;
+    for (const auto &fragment : fragments) {
+      if (!renderFragment || fragment.depth < renderFragment->depth) {  // strict '<': on equal depth the earlier fragment is kept
+        renderFragment = &fragment;
+      }
+    }
+    selected_fragments[i] = renderFragment;  // points into fragmentsBuffer, which is not modified after this stage
+  }
+  auto merge_end = std::chrono::high_resolution_clock::now();
+  auto merge_ms = std::chrono::duration_cast<std::chrono::microseconds>(merge_end - merge_start).count() / 1000.0;
+
+  // Deferred shading stage: run the fragment shader only on the selected fragments
+  auto shade_start = std::chrono::high_resolution_clock::now();
+#pragma omp parallel for
+  for (size_t i = 0; i < selected_fragments.size(); i++) {
+    const Fragment *renderFragment = selected_fragments[i];
+    if (renderFragment) {
+      // Validate the Material pointer: log and skip the pixel when it is null
+      if (renderFragment->material == nullptr) {
+        SPDLOG_ERROR("Fragment material is nullptr at pixel {}", i);
+        continue;
+      }
+      auto color = shader->FragmentShader(*renderFragment);
+      buffer[i] = uint32_t(color);  // pixels without a selected fragment are never written by this function
+    }
+  }
+  auto shade_end = std::chrono::high_resolution_clock::now();
+  auto shade_ms = std::chrono::duration_cast<std::chrono::microseconds>(shade_end - shade_start).count() / 1000.0;
+
+  auto total_end_time = std::chrono::high_resolution_clock::now();
+  double total_ms = std::chrono::duration_cast<std::chrono::microseconds>(total_end_time - total_start_time).count() / 1000.0;
+
+  SPDLOG_DEBUG("=== DEFERRED RENDERING PERFORMANCE ===");
+  double sum_ms = vertex_ms + (total_ms - vertex_ms);  // algebraically equal to total_ms
+  SPDLOG_DEBUG("Vertex Shader:        {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/sum_ms*100);
+  SPDLOG_DEBUG("Buffer Alloc:         {:8.3f} ms", buffer_alloc_ms);
+  SPDLOG_DEBUG("Rasterization:        {:8.3f} ms", raster_ms);
+  SPDLOG_DEBUG("Fragment Collection:  {:8.3f} ms", collect_ms);
+  SPDLOG_DEBUG("Fragment Merge:       {:8.3f} ms", merge_ms);
+  SPDLOG_DEBUG("Deferred Shading:     {:8.3f} ms", shade_ms);
+  SPDLOG_DEBUG("Total:                {:8.3f} ms", vertex_ms + (buffer_alloc_ms + raster_ms + collect_ms + merge_ms + shade_ms));  // sum of the timed stages, not total_ms
+  SPDLOG_DEBUG("=========================================");
+
+  return true;
+}
+
+}  // namespace simple_renderer
diff --git a/src/renderers/per_triangle_renderer.cpp b/src/renderers/per_triangle_renderer.cpp
new file mode 100644
index 0000000..9348594
--- /dev/null
+++ b/src/renderers/per_triangle_renderer.cpp
@@ -0,0 +1,173 @@
+#include "renderers/per_triangle_renderer.hpp"
+
+#include <omp.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cstring>
+#include <limits>
+#include <memory>
+
+#include "config.h"
+#include "log_system.h"
+
+namespace simple_renderer {
+
+bool PerTriangleRenderer::Render(const Model &model, const Shader &shader_in,
+                                 uint32_t *buffer) {  // Stages: vertex transform -> per-thread rasterize/shade with depth test -> merge into `buffer`. Always returns true.
+  auto total_start_time = std::chrono::high_resolution_clock::now();
+
+  // Copy the shader so the worker threads can share a single instance
+  auto shader = std::make_shared<Shader>(shader_in);
+  shader->PrepareUniformCaches();
+
+  // Vertex transform (AoS): vertex shader -> perspective division -> viewport transform
+  auto vertex_start = std::chrono::high_resolution_clock::now();
+  const auto &input_vertices = model.GetVertices();
+  std::vector<Vertex> processedVertices(input_vertices.size());
+
+#pragma omp parallel for num_threads(kNProc) schedule(static) \
+    shared(shader, processedVertices, input_vertices)
+  for (size_t i = 0; i < input_vertices.size(); ++i) {
+    const auto &v = input_vertices[i];
+    auto clipSpaceVertex = shader->VertexShader(v);
+    auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
+    auto screenSpaceVertex = ViewportTransformation(ndcVertex);
+    processedVertices[i] = screenSpaceVertex;  // each iteration writes only its own slot
+  }
+  auto vertex_end = std::chrono::high_resolution_clock::now();
+  auto vertex_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                       vertex_end - vertex_start)
+                       .count() /
+                   1000.0;
+
+  // 1. Create a private depth/color framebuffer for every thread
+  auto buffer_alloc_start = std::chrono::high_resolution_clock::now();
+  std::vector<std::unique_ptr<float[]>> depthBuffer_all_thread(kNProc);
+  std::vector<std::unique_ptr<uint32_t[]>> colorBuffer_all_thread(kNProc);
+
+  for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
+    depthBuffer_all_thread[thread_id] =
+        std::make_unique<float[]>(width_ * height_);
+    colorBuffer_all_thread[thread_id] =
+        std::make_unique<uint32_t[]>(width_ * height_);
+    std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_,
+                std::numeric_limits<float>::infinity());  // +inf depth: any finite fragment depth passes the first test
+    std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0);
+  }
+  auto buffer_alloc_end = std::chrono::high_resolution_clock::now();
+  auto buffer_alloc_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                             buffer_alloc_end - buffer_alloc_start)
+                             .count() /
+                         1000.0;
+
+  // 2. Rasterize in parallel
+  auto raster_start = std::chrono::high_resolution_clock::now();
+#pragma omp parallel num_threads(kNProc) default(none)              \
+    shared(processedVertices, shader, rasterizer_, width_, height_, \
+               depthBuffer_all_thread, colorBuffer_all_thread, model)
+  {
+    int thread_id = omp_get_thread_num();
+    auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id];
+    auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id];
+
+#pragma omp for
+    for (const auto &f : model.GetFaces()) {  // NOTE(review): a range-based for under "omp for" needs OpenMP 5.0 support - confirm for the target toolchain
+      auto v0 = processedVertices[f.GetIndex(0)];
+      auto v1 = processedVertices[f.GetIndex(1)];
+      auto v2 = processedVertices[f.GetIndex(2)];
+
+      // Back-face culling (screen-space cross product)
+      Vector2f screen0(v0.GetPosition().x, v0.GetPosition().y);
+      Vector2f screen1(v1.GetPosition().x, v1.GetPosition().y);
+      Vector2f screen2(v2.GetPosition().x, v2.GetPosition().y);
+
+      // Two edges leaving v0; their 2D cross product gives the triangle's orientation
+      Vector2f edge1 = screen1 - screen0;
+      Vector2f edge2 = screen2 - screen0;
+
+      // Back-face rule: in NDC a negative cross product means clockwise winding, i.e. a back face.
+      // Going from NDC to screen space flips the Y axis, so the same back face has a positive cross product here.
+      float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
+      if (cross_product > 0.0f) {
+        continue;  // back face
+      }
+
+      const Material *material = &f.GetMaterial();
+      auto fragments = rasterizer_->Rasterize(v0, v1, v2);
+
+      for (auto &fragment : fragments) {
+        fragment.material = material;
+        size_t x = fragment.screen_coord[0];
+        size_t y = fragment.screen_coord[1];
+        if (x >= width_ || y >= height_) {  // drop fragments outside the framebuffer
+          continue;
+        }
+        size_t index = x + y * width_;
+        if (fragment.depth < depthBuffer_per_thread[index]) {  // depth test against this thread's buffer only
+          depthBuffer_per_thread[index] = fragment.depth;
+          auto color = shader->FragmentShader(fragment);
+          colorBuffer_per_thread[index] = uint32_t(color);
+        }
+      }
+    }
+  }
+  auto raster_end = std::chrono::high_resolution_clock::now();
+  auto raster_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                       raster_end - raster_start)
+                       .count() /
+                   1000.0;
+
+  // 3. Merge the results: for every pixel keep the color from the thread with the smallest depth
+  auto merge_start = std::chrono::high_resolution_clock::now();
+  std::unique_ptr<float[]> depthBuffer =
+      std::make_unique<float[]>(width_ * height_);
+  std::unique_ptr<uint32_t[]> colorBuffer =
+      std::make_unique<uint32_t[]>(width_ * height_);
+  std::fill_n(depthBuffer.get(), width_ * height_,
+              std::numeric_limits<float>::infinity());
+  std::fill_n(colorBuffer.get(), width_ * height_, 0);
+
+#pragma omp parallel for
+  for (size_t i = 0; i < width_ * height_; i++) {
+    float min_depth = std::numeric_limits<float>::infinity();
+    uint32_t color = 0;  // stays 0 when no thread produced a fragment for this pixel
+    for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
+      float depth = depthBuffer_all_thread[thread_id][i];
+      if (depth < min_depth) {
+        min_depth = depth;
+        color = colorBuffer_all_thread[thread_id][i];
+      }
+    }
+    depthBuffer[i] = min_depth;
+    colorBuffer[i] = color;
+  }
+
+  std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t));  // overwrites all width_ * height_ pixels of the output
+  auto merge_end = std::chrono::high_resolution_clock::now();
+  auto merge_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                      merge_end - merge_start)
+                      .count() /
+                  1000.0;
+
+  auto total_end_time = std::chrono::high_resolution_clock::now();
+  auto total_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                      total_end_time - total_start_time)
+                      .count() /
+                  1000.0;
+
+  SPDLOG_DEBUG("=== PER-TRIANGLE RENDERING PERFORMANCE ===");
+  double sum_ms = vertex_ms + (total_ms - vertex_ms);  // algebraically equal to total_ms
+  SPDLOG_DEBUG("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms,
+              vertex_ms / sum_ms * 100);
+  SPDLOG_DEBUG("Buffer Alloc:     {:8.3f} ms", buffer_alloc_ms);
+  SPDLOG_DEBUG("Rasterization:    {:8.3f} ms", raster_ms);
+  SPDLOG_DEBUG("Merge:            {:8.3f} ms", merge_ms);
+  SPDLOG_DEBUG("Total:            {:8.3f} ms",
+              vertex_ms + (buffer_alloc_ms + raster_ms + merge_ms));
+  SPDLOG_DEBUG("==========================================");
+
+  return true;
+}
+
+}  // namespace simple_renderer
diff --git a/src/renderers/renderer_base.cpp b/src/renderers/renderer_base.cpp
new file mode 100644
index 0000000..5a82e5a
--- /dev/null
+++ b/src/renderers/renderer_base.cpp
@@ -0,0 +1,44 @@
+#include "renderers/renderer_base.hpp"
+
+#include <algorithm>
+
+namespace simple_renderer {
+
+Vertex RendererBase::PerspectiveDivision(const Vertex &vertex) {  // Divides the clip-space position by w to get NDC; the returned position.w holds 1/w.
+  Vector4f position = vertex.GetPosition();
+
+  if (position.w <= kMinWValue) {  // w at or below the threshold: skip the division and substitute a fixed position
+    Vector4f farPosition(0.0f, 0.0f, 1.0f, 1.0f);
+    return Vertex(farPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());  // this path does not forward GetClipPosition(), unlike the return below
+  }
+
+  float original_w = position.w;
+  Vector4f ndcPosition(
+      position.x / position.w,  // x_ndc = x_clip / w_clip
+      position.y / position.w,  // y_ndc = y_clip / w_clip
+      position.z / position.w,  // z_ndc = z_clip / w_clip
+      1.0f / original_w         // keep 1/w for perspective-correct interpolation
+  );
+
+  ndcPosition.z = std::clamp(ndcPosition.z, -1.0f, 1.0f);  // clamp depth into [-1, 1]
+  return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor(), vertex.GetClipPosition());
+}
+
+Vertex RendererBase::ViewportTransformation(const Vertex &vertex) {  // Maps the x/y of an NDC position to screen coordinates.
+  Vector4f ndcPosition = vertex.GetPosition();
+
+  // Viewport transform: map NDC coordinates in [-1,1] to screen coordinates in [0,width] x [0,height]
+  float screen_x = (ndcPosition.x + 1.0f) * width_ / 2.0f;
+  float screen_y = (1.0f - ndcPosition.y) * height_ / 2.0f;  // Y is flipped: NDC y = +1 maps to screen y = 0
+
+  Vector4f screenPosition(
+      screen_x,
+      screen_y,
+      ndcPosition.z,
+      ndcPosition.w);  // z and w are passed through unchanged
+
+  return Vertex(screenPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());  // NOTE(review): GetClipPosition() is not forwarded here, unlike in PerspectiveDivision - confirm this is intended
+}
+
+}  // namespace simple_renderer
+
diff --git a/src/renderers/tile_based_renderer.cpp b/src/renderers/tile_based_renderer.cpp
new file mode 100644
index 0000000..e39526e
--- /dev/null
+++ b/src/renderers/tile_based_renderer.cpp
@@ -0,0 +1,535 @@
+#include "renderers/tile_based_renderer.hpp"
+
+#include <omp.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cstring>
+#include <limits>
+#include <cmath>
+
+#include "config.h"
+#include "log_system.h"
+
+namespace simple_renderer {
+
+bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
+                               uint32_t *buffer) {  // buffer receives width_ * height_ pixels
+  auto total_start_time = std::chrono::high_resolution_clock::now();
+  auto shader = std::make_shared<Shader>(shader_in);  // private copy; its caches are prepared next
+  shader->PrepareUniformCaches();
+
+  // Vertex transformation (results stored as SoA)
+  auto vertex_start = std::chrono::high_resolution_clock::now();
+  const auto &input_vertices = model.GetVertices();
+  VertexSoA soa;
+  soa.resize(input_vertices.size());
+
+#pragma omp parallel for num_threads(kNProc) schedule(static) \
+    shared(shader, soa, input_vertices)
+  for (size_t i = 0; i < input_vertices.size(); ++i) {
+    const auto &v = input_vertices[i];
+    auto clipSpaceVertex = shader->VertexShader(v);
+    soa.pos_clip[i] = clipSpaceVertex.GetPosition();
+    auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
+    auto screenSpaceVertex = ViewportTransformation(ndcVertex);
+    soa.pos_screen[i] = screenSpaceVertex.GetPosition();
+    soa.normal[i] = screenSpaceVertex.GetNormal();
+    soa.uv[i] = screenSpaceVertex.GetTexCoords();
+    soa.color[i] = screenSpaceVertex.GetColor();
+  }
+  auto vertex_end = std::chrono::high_resolution_clock::now();
+  auto vertex_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                       vertex_end - vertex_start)
+                       .count() /
+                   1000.0;
+
+  // 1. Setup
+  auto setup_start = std::chrono::high_resolution_clock::now();
+  const size_t TILE_SIZE = tile_size_ > 0 ? tile_size_ : 64;  // fall back to 64 when tile_size_ is 0
+  const size_t tiles_x = (width_ + TILE_SIZE - 1) / TILE_SIZE;  // ceiling division
+  const size_t tiles_y = (height_ + TILE_SIZE - 1) / TILE_SIZE;
+  const size_t total_tiles = tiles_x * tiles_y;
+
+  // Create one triangle list per tile (entries reference the SoA data)
+  std::vector<std::vector<TileTriangleRef>> tile_triangles(total_tiles);
+  auto setup_end = std::chrono::high_resolution_clock::now();
+  auto setup_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                      setup_end - setup_start)
+                      .count() /
+                  1000.0;
+
+  // 2. Binning
+  auto binning_start = std::chrono::high_resolution_clock::now();
+  TileGridContext grid_ctx{soa, tiles_x, tiles_y, TILE_SIZE};
+  TriangleTileBinning(model, grid_ctx, tile_triangles);
+  auto binning_end = std::chrono::high_resolution_clock::now();
+  auto binning_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                        binning_end - binning_start)
+                        .count() /
+                    1000.0;
+
+  // 3. Single global framebuffer
+  // Tiles write directly into this buffer, avoiding a final O(W*H*kNProc) merge
+
+  auto buffer_alloc_start = std::chrono::high_resolution_clock::now();
+  std::unique_ptr<float[]> depthBuffer =
+      std::make_unique<float[]>(width_ * height_);
+  std::unique_ptr<uint32_t[]> colorBuffer =
+      std::make_unique<uint32_t[]>(width_ * height_);
+
+  // Fill depth with kDepthClear and colour with kColorClear
+  std::fill_n(depthBuffer.get(), width_ * height_, kDepthClear);
+  std::fill_n(colorBuffer.get(), width_ * height_, kColorClear);
+  auto buffer_alloc_end = std::chrono::high_resolution_clock::now();
+  auto buffer_alloc_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                             buffer_alloc_end - buffer_alloc_start)
+                             .count() /
+                         1000.0;
+
+  // 4. Rasterise every tile in parallel (SoA + early-z)
+  auto raster_start = std::chrono::high_resolution_clock::now();
+  std::vector<TileMaskStats> tile_stats(total_tiles);
+#pragma omp parallel num_threads(kNProc) default(none)                        \
+    shared(tile_triangles, shader, depthBuffer, colorBuffer, total_tiles,     \
+               grid_ctx, early_z_, tile_stats)
+  {
+    // Per-thread tile-local depth and colour buffers, reused for every tile this thread processes
+    std::unique_ptr<float[]> tile_depth_buffer =
+        std::make_unique<float[]>(grid_ctx.tile_size * grid_ctx.tile_size);
+    std::unique_ptr<uint32_t[]> tile_color_buffer =
+        std::make_unique<uint32_t[]>(grid_ctx.tile_size * grid_ctx.tile_size);
+
+    // Per-thread reusable scratch fragment container; capacity reserved for one full tile
+    std::vector<Fragment> scratch_fragments;
+    scratch_fragments.reserve(grid_ctx.tile_size * grid_ctx.tile_size);
+
+#pragma omp for schedule(static)
+    for (size_t tile_id = 0; tile_id < total_tiles; ++tile_id) {
+      // Rasterise one tile (SoA)
+      // Writes go to the single global framebuffer; tiles do not overlap, so no locking is needed
+      RasterizeTile(tile_id, tile_triangles[tile_id], grid_ctx,
+                    tile_depth_buffer.get(), tile_color_buffer.get(),
+                    depthBuffer, colorBuffer, *shader, early_z_,
+                    &scratch_fragments, &tile_stats[tile_id]);
+    }
+  }
+  auto raster_end = std::chrono::high_resolution_clock::now();
+  auto raster_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                       raster_end - raster_start)
+                       .count() /
+                   1000.0;
+
+  // Aggregate and log the per-tile mask statistics
+  uint64_t sum_tested = 0, sum_covered = 0, sum_zpass = 0, sum_shaded = 0;
+  for (const auto& s : tile_stats) {
+    sum_tested += s.tested;
+    sum_covered += s.covered;
+    sum_zpass   += s.zpass;
+    sum_shaded  += s.shaded;
+  }
+  auto rate = [](uint64_t num, uint64_t den) -> double {
+    if (den == 0) return 0.0; return double(num) / double(den) * 100.0;
+  };
+  SPDLOG_DEBUG(
+      "TBR Mask Stats: tested={}, covered={} ({:.1f}%), zpass={} ({:.1f}%), shaded={} ({:.1f}%)",
+      sum_tested, sum_covered, rate(sum_covered, sum_tested),
+      sum_zpass, rate(sum_zpass, sum_covered),
+      sum_shaded, rate(sum_shaded, sum_covered));
+
+  // 5. Copy the single global colorBuffer straight to the output
+  auto present_start = std::chrono::high_resolution_clock::now();
+  std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t));
+  auto present_end = std::chrono::high_resolution_clock::now();
+  auto present_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                        present_end - present_start)
+                        .count() /
+                    1000.0;
+
+  auto total_end_time = std::chrono::high_resolution_clock::now();
+  double total_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                        total_end_time - total_start_time)
+                        .count() /
+                    1000.0;
+
+  SPDLOG_DEBUG("=== TILE-BASED RENDERING PERFORMANCE ===");
+  double sum_ms = vertex_ms + (total_ms - vertex_ms);  // NOTE(review): algebraically equal to total_ms
+  SPDLOG_DEBUG("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms,
+              vertex_ms / sum_ms * 100);
+  SPDLOG_DEBUG("Setup:            {:8.3f} ms", setup_ms);
+  SPDLOG_DEBUG("Binning:          {:8.3f} ms", binning_ms);
+  SPDLOG_DEBUG("Buffer Alloc:     {:8.3f} ms", buffer_alloc_ms);
+  SPDLOG_DEBUG("Rasterization:    {:8.3f} ms", raster_ms);
+  SPDLOG_DEBUG("Copy:             {:8.3f} ms", present_ms);
+  SPDLOG_DEBUG("Total:            {:8.3f} ms",
+              vertex_ms + (setup_ms + binning_ms + buffer_alloc_ms + raster_ms +
+                          present_ms));
+  SPDLOG_DEBUG("==========================================");
+
+  return true;
+}
+
+void TileBasedRenderer::TriangleTileBinning(
+    const Model& model,
+    const TileGridContext& grid,
+    std::vector<std::vector<TileTriangleRef>> &tile_triangles) {
+  const size_t triangle_count = model.GetFaces().size();
+
+  SPDLOG_DEBUG("Starting triangle-tile binning (SoA) for {} triangles",
+              triangle_count);
+  SPDLOG_DEBUG("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", width_,
+              height_, grid.tile_size, grid.tiles_x, grid.tiles_y);
+
+  // Pass 1 (count only): how many triangles does each tile have to hold?
+  std::vector<size_t> per_tile_count(grid.tiles_x * grid.tiles_y, 0);
+  for (size_t tri = 0; tri != triangle_count; ++tri) {
+    ProcessTriangleForTileBinning(tri, /*count_only=*/true, model, grid,
+                                  per_tile_count, tile_triangles);
+  }
+
+  // Reserve the counted capacity up front to avoid growth while filling.
+  size_t tile_id = 0;
+  for (auto &bin : tile_triangles) {
+    const size_t wanted = per_tile_count[tile_id++];
+    if (wanted > 0) bin.reserve(wanted);
+  }
+
+  // Pass 2 (fill): store the triangle references in the per-tile lists.
+  for (size_t tri = 0; tri != triangle_count; ++tri) {
+    ProcessTriangleForTileBinning(tri, /*count_only=*/false, model, grid,
+                                  per_tile_count, tile_triangles);
+  }
+
+  // Summary statistics for the debug log.
+  size_t ref_total = 0;
+  size_t occupied_tiles = 0;
+  for (const auto &bin : tile_triangles) {
+    ref_total += bin.size();
+    occupied_tiles += bin.empty() ? 0 : 1;
+  }
+  SPDLOG_DEBUG("  (SoA) Total triangle references: {}", ref_total);
+  SPDLOG_DEBUG("  (SoA) Non-empty tiles: {}", occupied_tiles);
+  SPDLOG_DEBUG("  (SoA) Average triangles per tile: {:.2f}",
+              ref_total > 0 ? float(ref_total) / tile_triangles.size()
+                            : 0.0f);
+}
+
+void TileBasedRenderer::RasterizeTile(
+    size_t tile_id, const std::vector<TileTriangleRef> &triangles,
+    const TileGridContext& grid, float *tile_depth_buffer,
+    uint32_t *tile_color_buffer, std::unique_ptr<float[]> &global_depth_buffer,
+    std::unique_ptr<uint32_t[]> &global_color_buffer,
+    const Shader &shader, bool use_early_z,
+    std::vector<Fragment> *scratch_fragments,  // not used in this body
+    TileMaskStats* out_stats) {  // optional; skipped when null
+  // Rasterise one tile into its local buffers, then copy them out. First: the tile's screen rectangle
+  size_t tile_x = tile_id % grid.tiles_x;
+  size_t tile_y = tile_id / grid.tiles_x;
+  size_t screen_x_start = tile_x * grid.tile_size;
+  size_t screen_y_start = tile_y * grid.tile_size;
+  size_t screen_x_end = std::min(screen_x_start + grid.tile_size, width_);
+  size_t screen_y_end = std::min(screen_y_start + grid.tile_size, height_);
+
+  // Reset the tile-local buffers (row stride is the clipped tile width)
+  size_t tile_width = screen_x_end - screen_x_start;
+  size_t tile_height = screen_y_end - screen_y_start;
+  std::fill_n(tile_depth_buffer, tile_width * tile_height, kDepthClear);
+  std::fill_n(tile_color_buffer, tile_width * tile_height, kColorClear);
+
+  // Mask-based scan: each triangle writes straight into the tile-local buffers, no intermediate fragment vector
+  constexpr int kLane = 8;  // pixels processed per horizontal block (helps auto-vectorisation)
+
+  // Lightweight counters used to evaluate how much the masks save
+  uint64_t tested_pixels = 0;
+  uint64_t covered_pixels = 0;
+  uint64_t zpass_pixels = 0;
+  uint64_t shaded_pixels = 0;
+
+  auto cross2 = [](float ax, float ay, float bx, float by) {
+    return ax * by - ay * bx;
+  };
+
+  for (const auto &tri : triangles) {
+    const auto i0 = tri.i0, i1 = tri.i1, i2 = tri.i2;
+
+    // Screen-space vertex positions
+    const Vector4f &p0 = grid.soa.pos_screen[i0];
+    const Vector4f &p1 = grid.soa.pos_screen[i1];
+    const Vector4f &p2 = grid.soa.pos_screen[i2];
+
+    // Screen-space AABB of the triangle, intersected with the tile rectangle
+    const float tri_minx = std::min({p0.x, p1.x, p2.x});
+    const float tri_miny = std::min({p0.y, p1.y, p2.y});
+    const float tri_maxx = std::max({p0.x, p1.x, p2.x});
+    const float tri_maxy = std::max({p0.y, p1.y, p2.y});
+
+    int sx = std::max<int>(static_cast<int>(screen_x_start),
+                           static_cast<int>(std::floor(std::max(0.0f, tri_minx))));
+    int sy = std::max<int>(static_cast<int>(screen_y_start),
+                           static_cast<int>(std::floor(std::max(0.0f, tri_miny))));
+    int ex = std::min<int>(static_cast<int>(screen_x_end - 1),
+                           static_cast<int>(std::floor(std::min<float>(width_ - 1, tri_maxx))));
+    int ey = std::min<int>(static_cast<int>(screen_y_end - 1),
+                           static_cast<int>(std::floor(std::min<float>(height_ - 1, tri_maxy))));
+    if (sx > ex || sy > ey) continue;
+
+    // Edge vectors and signed (doubled) area
+    const float e01x = p1.x - p0.x, e01y = p1.y - p0.y;
+    const float e12x = p2.x - p1.x, e12y = p2.y - p1.y;
+    const float e20x = p0.x - p2.x, e20y = p0.y - p2.y;
+    const float area2 = cross2(e01x, e01y, p2.x - p0.x, p2.y - p0.y);
+    if (std::abs(area2) < 1e-6f) continue;  // degenerate triangle
+    const bool positive = (area2 > 0.0f);
+
+    // Per-vertex z and 1 / pos_screen.w. NOTE(review): PerspectiveDivision already stores 1/w_clip in w — confirm which is intended
+    const float z0 = p0.z, z1 = p1.z, z2 = p2.z;
+    const float w0_inv = 1.0f / p0.w, w1_inv = 1.0f / p1.w, w2_inv = 1.0f / p2.w;
+
+    // Row scan
+    for (int y = sy; y <= ey; ++y) { // row-major traversal: cache- and vectorisation-friendly
+      const float yf = static_cast<float>(y);
+      for (int xb = sx; xb <= ex; xb += kLane) { // kLane pixels per step
+        const int lane = std::min(kLane, ex - xb + 1); // number of pixels in this block
+        const float x0f = static_cast<float>(xb); // x of the block's first pixel
+
+        // Edge-function values at the block start and their horizontal step (dE/dx)
+        float E01_base = cross2(e01x, e01y, x0f - p0.x, yf - p0.y);
+        float E12_base = cross2(e12x, e12y, x0f - p1.x, yf - p1.y);
+        float E20_base = cross2(e20x, e20y, x0f - p2.x, yf - p2.y);
+        const float dE01dx = -e01y;
+        const float dE12dx = -e12y;
+        const float dE20dx = -e20y;
+
+        // ============== Build the coverage mask ==============
+        unsigned mask_cover = 0u;
+        int cover_count = 0;
+        float E01[kLane], E12[kLane], E20[kLane];
+        #pragma omp simd
+        for (int j = 0; j < lane; ++j) {
+          E01[j] = E01_base + dE01dx * static_cast<float>(j);
+          E12[j] = E12_base + dE12dx * static_cast<float>(j);
+          E20[j] = E20_base + dE20dx * static_cast<float>(j);
+        }
+        for (int j = 0; j < lane; ++j) { // inside test: covered pixels are added to the coverage mask
+          bool inside = positive ? (E01[j] >= 0.0f && E12[j] >= 0.0f && E20[j] >= 0.0f)
+                                 : (E01[j] <= 0.0f && E12[j] <= 0.0f && E20[j] <= 0.0f);
+          if (inside) {
+            mask_cover |= (1u << j);
+            cover_count++;
+          }
+        }
+        tested_pixels += static_cast<uint64_t>(lane);
+        covered_pixels += static_cast<uint64_t>(cover_count);
+        if (mask_cover == 0u) continue;
+
+        // ============== Compute z and build the early-z mask ==============
+        unsigned mask_zpass = 0u;
+        float zvals[kLane];
+        // Cache the corrected barycentrics so shading does not recompute them (only covered lanes are filled)
+        float b0c_arr[kLane];
+        float b1c_arr[kLane];
+        float b2c_arr[kLane];
+        int zpass_count = 0;
+        for (int j = 0; j < lane; ++j) {
+          if (((mask_cover >> j) & 1u) == 0u) { continue; } // skip pixels outside the coverage mask
+          const float b0 = E12[j] / area2;
+          const float b1 = E20[j] / area2;
+          const float b2 = E01[j] / area2;
+          const float w_inv = b0 * w0_inv + b1 * w1_inv + b2 * w2_inv; // perspective correction
+          const float b0c = (b0 * w0_inv) / w_inv;
+          const float b1c = (b1 * w1_inv) / w_inv;
+          const float b2c = (b2 * w2_inv) / w_inv;
+          b0c_arr[j] = b0c; b1c_arr[j] = b1c; b2c_arr[j] = b2c;
+          const float z = z0 * b0c + z1 * b1c + z2 * b2c;
+          zvals[j] = z;
+
+          const int sx_pix = xb + j;
+          const int local_x = sx_pix - static_cast<int>(screen_x_start);
+          const int local_y = y - static_cast<int>(screen_y_start);
+          const size_t idx = static_cast<size_t>(local_x + local_y * static_cast<int>(tile_width));
+          if (z < tile_depth_buffer[idx]) {
+            mask_zpass |= (1u << j);
+            zpass_count++;
+          }
+        }
+        zpass_pixels += static_cast<uint64_t>(zpass_count);
+
+        // ============== Build the final mask ==============
+        unsigned mask_final = use_early_z ? (mask_cover & mask_zpass) : mask_cover;
+        if (mask_final == 0u) continue;
+
+        // Shade and write back the masked pixels (without early-z: shade first, then z-test on write)
+        for (int j = 0; j < lane; ++j) {
+          if (((mask_final >> j) & 1u) == 0u) continue;  // in both modes: unmasked lanes have no valid z/barycentrics
+          const int sx_pix = xb + j;
+          const int local_x = sx_pix - static_cast<int>(screen_x_start);
+          const int local_y = y - static_cast<int>(screen_y_start);
+          const size_t idx = static_cast<size_t>(local_x + local_y * static_cast<int>(tile_width));
+
+          // Interpolated attributes
+          const float b0c = b0c_arr[j];
+          const float b1c = b1c_arr[j];
+          const float b2c = b2c_arr[j];
+
+          Fragment frag;
+          frag.screen_coord = {sx_pix, y};
+          frag.depth = zvals[j];
+          frag.material = tri.material;
+
+          // Normal interpolation
+          const Vector3f &n0 = grid.soa.normal[i0];
+          const Vector3f &n1 = grid.soa.normal[i1];
+          const Vector3f &n2 = grid.soa.normal[i2];
+          frag.normal = n0 * b0c + n1 * b1c + n2 * b2c;
+
+          // Texture-coordinate interpolation
+          const Vector2f &uv0 = grid.soa.uv[i0];
+          const Vector2f &uv1 = grid.soa.uv[i1];
+          const Vector2f &uv2 = grid.soa.uv[i2];
+          frag.uv = uv0 * b0c + uv1 * b1c + uv2 * b2c;
+
+          // Colour interpolation
+          const Color &c0 = grid.soa.color[i0];
+          const Color &c1 = grid.soa.color[i1];
+          const Color &c2 = grid.soa.color[i2];
+          auto color_r = FloatToUint8_t(static_cast<float>(c0[Color::kColorIndexRed]) * b0c +
+                                        static_cast<float>(c1[Color::kColorIndexRed]) * b1c +
+                                        static_cast<float>(c2[Color::kColorIndexRed]) * b2c);
+          auto color_g = FloatToUint8_t(static_cast<float>(c0[Color::kColorIndexGreen]) * b0c +
+                                        static_cast<float>(c1[Color::kColorIndexGreen]) * b1c +
+                                        static_cast<float>(c2[Color::kColorIndexGreen]) * b2c);
+          auto color_b = FloatToUint8_t(static_cast<float>(c0[Color::kColorIndexBlue]) * b0c +
+                                        static_cast<float>(c1[Color::kColorIndexBlue]) * b1c +
+                                        static_cast<float>(c2[Color::kColorIndexBlue]) * b2c);
+          frag.color = Color(color_r, color_g, color_b);
+
+          if (use_early_z) { // early-z on: only pixels that already passed the depth test are shaded and written
+            auto out_color = shader.FragmentShader(frag);
+            tile_depth_buffer[idx] = frag.depth;
+            tile_color_buffer[idx] = uint32_t(out_color);
+            shaded_pixels++;
+          } else {
+            // early-z off: shade first, then z-test on write
+            auto out_color = shader.FragmentShader(frag);
+            if (frag.depth < tile_depth_buffer[idx]) { // late-z
+              tile_depth_buffer[idx] = frag.depth;
+              tile_color_buffer[idx] = uint32_t(out_color);
+              shaded_pixels++;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if (out_stats) {
+    out_stats->tested = tested_pixels;
+    out_stats->covered = covered_pixels;
+    out_stats->zpass = zpass_pixels;
+    out_stats->shaded = shaded_pixels;
+  }
+
+  // Write back to the global buffers.
+  // Tiles cover disjoint screen regions and each pixel's final value has already
+  // been resolved inside the tile, so the tile rows can be copied out directly.
+  for (size_t y = 0; y < tile_height; y++) {
+    const size_t tile_row_off = y * tile_width;
+    const size_t global_row_off =
+        (screen_y_start + y) * width_ + screen_x_start;
+
+    // Copy this row of colour into the global colour buffer
+    std::memcpy(global_color_buffer.get() + global_row_off,
+                tile_color_buffer + tile_row_off,
+                tile_width * sizeof(uint32_t));
+
+    // Copy this row of depth into the global depth buffer
+    std::memcpy(global_depth_buffer.get() + global_row_off,
+                tile_depth_buffer + tile_row_off, tile_width * sizeof(float));
+  }
+}
+
+void TileBasedRenderer::ProcessTriangleForTileBinning(
+    size_t tri_idx, bool count_only, const Model &model,
+    const TileGridContext &grid, std::vector<size_t> &tile_counts,
+    std::vector<std::vector<TileTriangleRef>> &tile_triangles) {
+  const auto &f = model.GetFaces()[tri_idx];
+  size_t i0 = f.GetIndex(0);
+  size_t i1 = f.GetIndex(1);
+  size_t i2 = f.GetIndex(2);
+
+  // Frustum culling (clip space)
+  // Conservative: cull only when all three vertices lie outside the same plane, each tested against its own w
+  const Vector4f &c0 = grid.soa.pos_clip[i0];
+  const Vector4f &c1 = grid.soa.pos_clip[i1];
+  const Vector4f &c2 = grid.soa.pos_clip[i2];
+  bool frustum_cull =
+      (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) ||     // outside right plane
+      (c0.x < -c0.w && c1.x < -c1.w && c2.x < -c2.w) ||  // outside left plane
+      (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) ||     // outside top plane
+      (c0.y < -c0.w && c1.y < -c1.w && c2.y < -c2.w) ||  // outside bottom plane
+      (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) ||     // outside far plane
+      (c0.z < -c0.w && c1.z < -c1.w && c2.z < -c2.w);    // outside near plane
+  if (frustum_cull) {
+    return;
+  }
+
+  const Vector4f &pos0 = grid.soa.pos_screen[i0];
+  const Vector4f &pos1 = grid.soa.pos_screen[i1];
+  const Vector4f &pos2 = grid.soa.pos_screen[i2];
+
+  // Back-face culling (screen space)
+  // In NDC a negative cross product means clockwise winding, i.e. back-facing.
+  // The NDC-to-screen transform flips the Y axis, so here the sign is positive.
+  Vector2f screen0(pos0.x, pos0.y);
+  Vector2f screen1(pos1.x, pos1.y);
+  Vector2f screen2(pos2.x, pos2.y);
+  Vector2f edge1 = screen1 - screen0;
+  Vector2f edge2 = screen2 - screen0;
+  float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
+  if (cross_product > 0.0f) return;
+
+  float screen_x0 = pos0.x;
+  float screen_y0 = pos0.y;
+  float screen_x1 = pos1.x;
+  float screen_y1 = pos1.y;
+  float screen_x2 = pos2.x;
+  float screen_y2 = pos2.y;
+
+  // Screen-space bounding box, used to pick the tiles the triangle may touch
+  float min_x = std::min({screen_x0, screen_x1, screen_x2});
+  float max_x = std::max({screen_x0, screen_x1, screen_x2});
+  float min_y = std::min({screen_y0, screen_y1, screen_y2});
+  float max_y = std::max({screen_y0, screen_y1, screen_y2});
+
+  int start_tile_x = std::max(0, static_cast<int>(min_x) /
+                                     static_cast<int>(grid.tile_size));
+  int end_tile_x =
+      std::min(static_cast<int>(grid.tiles_x - 1),
+               static_cast<int>(max_x) / static_cast<int>(grid.tile_size));
+  int start_tile_y = std::max(0, static_cast<int>(min_y) /
+                                     static_cast<int>(grid.tile_size));
+  int end_tile_y =
+      std::min(static_cast<int>(grid.tiles_y - 1),
+               static_cast<int>(max_y) / static_cast<int>(grid.tile_size));
+  if (start_tile_x > end_tile_x || start_tile_y > end_tile_y)
+    return;  // the bounding box overlaps no tile
+
+  if (count_only) {  // pass 1: only count the triangles per tile
+    for (int ty = start_tile_y; ty <= end_tile_y; ++ty) {
+      for (int tx = start_tile_x; tx <= end_tile_x; ++tx) {
+        size_t tile_id = ty * grid.tiles_x + tx;
+        tile_counts[tile_id]++;
+      }
+    }
+  } else {  // pass 2: store a TileTriangleRef in every overlapped tile
+    TileTriangleRef tri_ref{i0, i1, i2, &f.GetMaterial(), tri_idx};
+    for (int ty = start_tile_y; ty <= end_tile_y; ++ty) {
+      for (int tx = start_tile_x; tx <= end_tile_x; ++tx) {
+        size_t tile_id = ty * grid.tiles_x + tx;
+        tile_triangles[tile_id].push_back(tri_ref);
+      }
+    }
+  }
+}
+
+}  // namespace simple_renderer
diff --git a/src/shader.cpp b/src/shader.cpp
index 3438627..06ab241 100644
--- a/src/shader.cpp
+++ b/src/shader.cpp
@@ -1,20 +1,261 @@
 #include "shader.hpp"
 
+#include <algorithm>
+#include <cmath>
+#include <mutex>
+#include <shared_mutex>
+
 namespace simple_renderer {
 
+Shader::Shader(const Shader& shader) {
+  std::shared_lock source_guard(shader.specular_cache_mutex_);  // shared lock on the source's LUT-cache mutex
+  specular_lut_cache_ = shader.specular_lut_cache_;
+  fragment_uniform_cache_ = shader.fragment_uniform_cache_;
+  vertex_uniform_cache_ = shader.vertex_uniform_cache_;
+  sharedDataInShader_ = shader.sharedDataInShader_;
+  uniformbuffer_ = shader.uniformbuffer_;
+}
+
+Shader::Shader(Shader&& shader) noexcept {
+  std::unique_lock source_guard(shader.specular_cache_mutex_);  // exclusive lock: the source is modified
+  specular_lut_cache_ = std::move(shader.specular_lut_cache_);
+  uniformbuffer_ = std::move(shader.uniformbuffer_);
+  fragment_uniform_cache_ = shader.fragment_uniform_cache_;
+  vertex_uniform_cache_ = shader.vertex_uniform_cache_;
+  sharedDataInShader_ = shader.sharedDataInShader_;
+}
+
+auto Shader::operator=(const Shader& shader) -> Shader& {
+  if (this != &shader) {
+    // Shared lock on the source only; this object's own mutex is not taken.
+    std::shared_lock source_guard(shader.specular_cache_mutex_);
+    specular_lut_cache_ = shader.specular_lut_cache_;
+    fragment_uniform_cache_ = shader.fragment_uniform_cache_;
+    vertex_uniform_cache_ = shader.vertex_uniform_cache_;
+    sharedDataInShader_ = shader.sharedDataInShader_;
+    uniformbuffer_ = shader.uniformbuffer_;
+  }
+  return *this;
+}
+
+auto Shader::operator=(Shader&& shader) noexcept -> Shader& {
+  if (this != &shader) {
+    // Exclusive lock on the source only; this object's own mutex is not taken.
+    std::unique_lock source_guard(shader.specular_cache_mutex_);
+    specular_lut_cache_ = std::move(shader.specular_lut_cache_);
+    uniformbuffer_ = std::move(shader.uniformbuffer_);
+    fragment_uniform_cache_ = shader.fragment_uniform_cache_;
+    vertex_uniform_cache_ = shader.vertex_uniform_cache_;
+    sharedDataInShader_ = shader.sharedDataInShader_;
+  }
+  return *this;
+}
+
 Vertex Shader::VertexShader(const Vertex& vertex) {
-  Matrix4f model_matrix = uniformbuffer_.GetUniform<Matrix4f>("modelMatrix");
-  Matrix4f view_matrix = uniformbuffer_.GetUniform<Matrix4f>("viewMatrix");
-  Matrix4f projection_matrix =
-      uniformbuffer_.GetUniform<Matrix4f>("projectionMatrix");
+  const bool cache_ready = vertex_uniform_cache_.derived_valid;
+
+  const Matrix4f* model_ptr = nullptr;
+  const Matrix4f* mvp_ptr = nullptr;
+  const Matrix3f* normal_ptr = nullptr;
+
+  Matrix4f fallback_model;
+  Matrix4f fallback_mvp;
+  Matrix3f fallback_normal;
+
+  if (cache_ready) { // all derived matrices are precomputed and can be reused
+    // Reuse the cached matrices and avoid per-vertex uniform-buffer lookups
+    model_ptr = &vertex_uniform_cache_.model;
+    mvp_ptr = &vertex_uniform_cache_.mvp;
+    normal_ptr = &vertex_uniform_cache_.normal;
+  } else { // the cache has not been built yet: compute the matrices locally
+    fallback_model = uniformbuffer_.GetUniform<Matrix4f>("modelMatrix");
+    Matrix4f view_matrix = uniformbuffer_.GetUniform<Matrix4f>("viewMatrix");
+    Matrix4f projection_matrix =
+        uniformbuffer_.GetUniform<Matrix4f>("projectionMatrix");
+    fallback_mvp = projection_matrix * view_matrix * fallback_model;
+    fallback_normal =
+        glm::transpose(glm::inverse(Matrix3f(fallback_model)));
+    model_ptr = &fallback_model;
+    mvp_ptr = &fallback_mvp;
+    normal_ptr = &fallback_normal;
+  }
+
+  const Matrix4f& model_matrix = *model_ptr;
+  const Matrix4f& mvp_matrix = *mvp_ptr;
+  const Matrix3f& normal_matrix = *normal_ptr;
+
+  const Vector4f position = vertex.GetPosition();
+  Vector4f world_position = model_matrix * position;
+  Vector3f transformed_normal = normal_matrix * vertex.GetNormal();
+
+  // Store the world-space position in the shared data for the fragment stage
+  sharedDataInShader_.fragPos_varying = Vector3f(world_position);  // NOTE(review): one shared member written per vertex; TileBasedRenderer::Render calls this from an OpenMP loop — confirm this is intended
+
+  // Compute the clip-space position
+  Vector4f clip_position = mvp_matrix * position;
+
+  // Return the transformed vertex (with the transformed normal and clip coordinates)
+  return Vertex(clip_position, transformed_normal, vertex.GetTexCoords(),
+                vertex.GetColor(),
+                clip_position);  // also passed as the clip position for later clipping
+}
+
+void Shader::UpdateMatrixCache(const std::string& name,
+                               const Matrix4f& value) {
+  auto& cache = vertex_uniform_cache_;
+  if (name == "modelMatrix") {
+    cache.model = value;
+    cache.has_model = true;
+  } else if (name == "viewMatrix") {
+    cache.view = value;
+    cache.has_view = true;
+  } else if (name == "projectionMatrix") {
+    cache.projection = value;
+    cache.has_projection = true;
+  } else {
+    return;  // not one of the cached matrices
+  }
+
+  // A base matrix changed: derived matrices are stale until recomputed.
+  cache.derived_valid = false;
+  const bool all_present =
+      cache.has_model && cache.has_view && cache.has_projection;
+  if (all_present) RecalculateDerivedMatrices();
+}
+
+void Shader::RecalculateDerivedMatrices() {
+  // Precompute model-view, MVP and the normal matrix once so the vertex
+  // shader can reuse them. The normal matrix is the transpose of the inverse
+  // of Matrix3f(model).
+  auto& cache = vertex_uniform_cache_;
+  cache.model_view = cache.view * cache.model;
+  cache.mvp = cache.projection * cache.model_view;
+  cache.normal = glm::transpose(glm::inverse(Matrix3f(cache.model)));
+  cache.derived_valid = true;
+}
+
+void Shader::UpdateFragmentCache(const std::string& name,
+                                 const Light& value) {
+  if (name != "light") return;  // this overload only caches the "light" uniform
+
+  auto& cache = fragment_uniform_cache_;
+  cache.light = value;
+  cache.has_light = true;
+  cache.derived_valid = false;  // stale until recomputed below
+
+  // Derived values need both the light and the camera position.
+  if (cache.has_light && cache.has_camera) RecalculateFragmentDerived();
+}
+
+void Shader::UpdateFragmentCache(const std::string& name,
+                                 const Vector3f& value) {
+  if (name != "cameraPos") return;  // this overload only caches "cameraPos"
+
+  auto& cache = fragment_uniform_cache_;
+  cache.camera_pos = value;
+  cache.has_camera = true;
+  cache.derived_valid = false;  // stale until recomputed below
+
+  // Derived values need both the light and the camera position.
+  if (cache.has_light && cache.has_camera) RecalculateFragmentDerived();
+}
+
+void Shader::RecalculateFragmentDerived() {
+  auto& cache = fragment_uniform_cache_;  // refresh the values derived from the light
+  cache.light_dir_normalized = glm::normalize(cache.light.dir);
+  cache.derived_valid = true;
+}
 
-  Matrix4f mvp_matrix = projection_matrix * view_matrix * model_matrix;
-  // auto normal_matrix = model_matrix.inverse().transpose();
+void Shader::PrepareUniformCaches() {  // Fill both uniform caches: vertex first, then fragment.
+  PrepareVertexUniformCache();
+  PrepareFragmentUniformCache();
+}
 
-  sharedDataInShader_.fragPos_varying =
-      Vector3f(model_matrix * vertex.GetPosition());
+void Shader::PrepareVertexUniformCache() {
+  auto& cache = vertex_uniform_cache_;
+  if (cache.derived_valid) return;  // already up to date
+
+  // Before rendering, fetch the three matrices once and fill the cache.
+  // Nothing happens unless all three are present in the uniform buffer.
+  const bool all_present =
+      uniformbuffer_.HasUniform<Matrix4f>("modelMatrix") &&
+      uniformbuffer_.HasUniform<Matrix4f>("viewMatrix") &&
+      uniformbuffer_.HasUniform<Matrix4f>("projectionMatrix");
+  if (!all_present) return;
+
+  cache.model = uniformbuffer_.GetUniform<Matrix4f>("modelMatrix");
+  cache.view = uniformbuffer_.GetUniform<Matrix4f>("viewMatrix");
+  cache.projection = uniformbuffer_.GetUniform<Matrix4f>("projectionMatrix");
+  cache.has_model = true;
+  cache.has_view = true;
+  cache.has_projection = true;
+  RecalculateDerivedMatrices();
+}
 
-  return mvp_matrix * vertex;
+void Shader::PrepareFragmentUniformCache() {
+  auto& cache = fragment_uniform_cache_;
+  if (cache.derived_valid) return;  // already up to date
+
+  // Both uniforms must be present, otherwise the cache is left untouched.
+  const bool all_present = uniformbuffer_.HasUniform<Light>("light") &&
+                           uniformbuffer_.HasUniform<Vector3f>("cameraPos");
+  if (!all_present) return;
+
+  cache.light = uniformbuffer_.GetUniform<Light>("light");
+  cache.camera_pos = uniformbuffer_.GetUniform<Vector3f>("cameraPos");
+  cache.has_light = true;
+  cache.has_camera = true;
+  RecalculateFragmentDerived();
+}
+
+auto Shader::BuildSpecularLUT(float shininess) const -> SpecularLUT {
+  SpecularLUT table;
+  if (shininess <= 0.0f) {
+    table.values.fill(1.0f);  // non-positive exponent: every entry is 1
+    return table;
+  }
+  // Entry i holds pow(i / (N - 1), shininess); a zero base is stored as 0.
+  const float last_index = static_cast<float>(kSpecularLutResolution - 1);
+  for (size_t slot = 0; slot < kSpecularLutResolution; ++slot) {
+    const float base = static_cast<float>(slot) / last_index;
+    table.values[slot] = base <= 0.0f ? 0.0f : std::pow(base, shininess);
+  }
+  return table;
+}
+
+auto Shader::GetSpecularLUT(float shininess) const -> const SpecularLUT& {
+  const uint32_t key = std::bit_cast<uint32_t>(shininess);  // keyed by the exponent's bit pattern
+  {
+    // Fast path: look the table up under the shared lock.
+    std::shared_lock read_guard(specular_cache_mutex_);
+    const auto found = specular_lut_cache_.find(key);
+    if (found != specular_lut_cache_.end()) return found->second;
+  }
+
+  // Miss: build the table without holding a lock, then insert it under the
+  // exclusive lock and return whatever entry the cache holds for the key.
+  SpecularLUT fresh = BuildSpecularLUT(shininess);
+  std::unique_lock write_guard(specular_cache_mutex_);
+  return specular_lut_cache_.emplace(key, std::move(fresh)).first->second;
+}
+
+auto Shader::EvaluateSpecular(float cos_theta, float shininess) const -> float {
+  if (shininess <= 0.0f) {  // non-positive exponent: the specular term is always 1
+    return 1.0f;
+  }
+  if (!(cos_theta > 0.0f)) {  // non-positive or NaN: must never reach the LUT index below
+    return 0.0f;
+  }
+  cos_theta = std::min(cos_theta, 1.0f);  // now in (0, 1]
+
+  const auto& lut = GetSpecularLUT(shininess);
+  float scaled = cos_theta * static_cast<float>(kSpecularLutResolution - 1);
+  size_t index = static_cast<size_t>(scaled);  // lower LUT sample
+  float frac = scaled - static_cast<float>(index);  // position between the two samples
+
+  const float v0 = lut.values[index];
+  const float v1 = lut.values[std::min(index + 1, kSpecularLutResolution - 1)];
+  return v0 + (v1 - v0) * frac;  // linear interpolation between neighbouring entries
 }
 
 Color Shader::FragmentShader(const Fragment& fragment) const {
@@ -24,14 +265,23 @@ Color Shader::FragmentShader(const Fragment& fragment) const {
   Vector2f uv = fragment.uv;
 
   // uniform
-  Light light = uniformbuffer_.GetUniform<Light>("light");
+  Light light;
+  Vector3f light_dir;
+  Vector3f camera_pos;
+  if (fragment_uniform_cache_.derived_valid) {
+    light = fragment_uniform_cache_.light;
+    light_dir = fragment_uniform_cache_.light_dir_normalized;
+    camera_pos = fragment_uniform_cache_.camera_pos;
+  } else {
+    light = uniformbuffer_.GetUniform<Light>("light");
+    camera_pos = uniformbuffer_.GetUniform<Vector3f>("cameraPos");
+    light_dir = glm::normalize(light.dir);
+  }
   Material material = *fragment.material;
 
   // view direction
   Vector3f view_dir =
-      glm::normalize(sharedDataInShader_.fragPos_varying -
-                     uniformbuffer_.GetUniform<Vector3f>("cameraPos"));
-  Vector3f light_dir = glm::normalize(light.dir);
+      glm::normalize(sharedDataInShader_.fragPos_varying - camera_pos);
 
   auto intensity = std::max(glm::dot(normal, light_dir), 0.0f);
   // texture color
@@ -51,8 +301,8 @@ Color Shader::FragmentShader(const Fragment& fragment) const {
   }
 
   Vector3f halfVector = glm::normalize(light_dir + view_dir);
-  float spec = std::pow(std::max(glm::dot(normal, halfVector), 0.0f),
-                        material.shininess);
+  float cos_theta = std::max(glm::dot(normal, halfVector), 0.0f);
+  float spec = EvaluateSpecular(cos_theta, material.shininess);
   if (material.has_specular_texture) {
     Color texture_color = SampleTexture(material.specular_texture, uv);
     specular_color = texture_color * spec;
@@ -108,4 +358,4 @@ Color Shader::ClampColor(const Color color) const {
   return Color(red, green, blue, alpha);
 }
 
-}  // namespace simple_renderer
\ No newline at end of file
+}  // namespace simple_renderer
diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp
index f75b29c..d6491d9 100755
--- a/test/system_test/main.cpp
+++ b/test/system_test/main.cpp
@@ -56,21 +56,20 @@ int main(int argc, char **argv) {
   }
 
   auto modelMatrix = simple_renderer::Matrix4f(1.0f);
+  
   simple_renderer::Matrix4f scale_matrix =
       glm::scale(simple_renderer::Matrix4f(1.0f),
-                 simple_renderer::Vector3f(7.0f, 7.0f, 7.0f));
+                 simple_renderer::Vector3f(.02f, .02f, .02f));
 
-  // Translation matrix
   simple_renderer::Matrix4f translation_matrix =
       glm::translate(simple_renderer::Matrix4f(1.0f),
-                     simple_renderer::Vector3f(30.0f, 30.0f, 0.0f));
+                     simple_renderer::Vector3f(0.0f, -5.0f, 0.0f));
 
   simple_renderer::Matrix4f rotation_matrix =
-      glm::rotate(simple_renderer::Matrix4f(1.0f), 90.0f,
+      glm::rotate(simple_renderer::Matrix4f(1.0f), glm::radians(-105.0f),
                   simple_renderer::Vector3f(1.0f, 0.0f, 0.0f));
 
-  // Combined transformation matrix
-  modelMatrix = scale_matrix * translation_matrix * rotation_matrix;
+  modelMatrix = scale_matrix* translation_matrix * rotation_matrix ;
 
   simple_renderer::Shader shader;
   shader.SetUniform("modelMatrix", modelMatrix);
@@ -81,6 +80,14 @@ int main(int argc, char **argv) {
 
   simple_renderer::Camera camera(simple_renderer::Vector3f(0.0f, 0.0f, 1.0f));
 
+  // 设置渲染模式（可选：PER_TRIANGLE、TILE_BASED 或 DEFERRED）
+  simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED);
+  
+  // 输出当前渲染模式
+  std::string current_mode_name = simple_renderer::RenderingModeToString(
+      simple_renderer.GetRenderingMode());
+  SPDLOG_INFO("当前渲染模式: {}", current_mode_name);
+
   auto display = Display(kWidth, kHeight);
   display.loopBegin();
 
@@ -90,11 +97,11 @@ int main(int argc, char **argv) {
     shader.SetUniform("cameraPos", camera.GetPosition());
     shader.SetUniform("viewMatrix", camera.GetViewMatrix());
     shader.SetUniform("projectionMatrix",
-                      camera.GetProjectionMatrix(60.0f, 1.0f, 0.1f, 100.0f));
+                      camera.GetProjectionMatrix(60.0f, static_cast<float>(kWidth) / static_cast<float>(kHeight), 0.1f, 100.0f));
 
     buffer.ClearDrawBuffer(simple_renderer::Color::kBlack);
     for (auto &model : models) {
-      simple_renderer.Render(model, shader, buffer.GetDrawBuffer());
+      simple_renderer.DrawModel(model, shader, buffer.GetDrawBuffer());
     }
 
     buffer.SwapBuffer();