From 0136489c4cfbe7070881baba87e84c10b7815d70 Mon Sep 17 00:00:00 2001
From: Jason DeBacker <jdebacker@users.noreply.github.com>
Date: Thu, 27 Mar 2025 18:36:50 -0400
Subject: [PATCH 1/5] edits to kde

---
 iot/inverse_optimal_tax.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/iot/inverse_optimal_tax.py b/iot/inverse_optimal_tax.py
index c262d05..b8556f1 100644
--- a/iot/inverse_optimal_tax.py
+++ b/iot/inverse_optimal_tax.py
@@ -274,14 +274,21 @@ def compute_income_dist(
             )
         elif dist_type == "kde":
             # uses the original full data for kde estimation
+            data2 = data.copy()
+            # for income > 500,000 replace actual values with draws from
+            # a pareto distribution
+            data2.loc[data2[income_measure] > 500_000, income_measure] = st.pareto.rvs(
+                1.16, size=data2.loc[data2[income_measure] > 500_000].shape[0]
+            )
             f_function = st.gaussian_kde(
-                data[income_measure],
-                # bw_method=kde_bw,
+                data2[income_measure],
+                bw_method=kde_bw,
                 weights=data[weight_var],
             )
-            f = f_function.pdf(z_line)
+            f = f_function(z_line)
+            f = f / np.sum(f)  # ensure sum to one
             F = np.cumsum(f)
-            f_prime = np.gradient(f, edge_order=2)
+            f_prime = np.gradient(f, z_line, edge_order=2)
         elif dist_type == "Pln":
 
             def pln_pdf(y, mu, sigma, alpha):

From 530fbe045ac58e17d070e001c870475fdfd8a2a3 Mon Sep 17 00:00:00 2001
From: Jason DeBacker <jdebacker@users.noreply.github.com>
Date: Tue, 1 Apr 2025 10:47:46 -0400
Subject: [PATCH 2/5] splice pareto

---
 iot/inverse_optimal_tax.py | 115 ++++++++++++++++++++++++++++++++++---
 1 file changed, 106 insertions(+), 9 deletions(-)

diff --git a/iot/inverse_optimal_tax.py b/iot/inverse_optimal_tax.py
index b8556f1..c917059 100644
--- a/iot/inverse_optimal_tax.py
+++ b/iot/inverse_optimal_tax.py
@@ -5,6 +5,7 @@
 from statsmodels.nonparametric.kernel_regression import KernelReg
 from scipy.interpolate import UnivariateSpline
 from scipy.linalg import lstsq
+from scipy.optimize import minimize
 
 
 class IOT:
@@ -246,12 +247,8 @@ def compute_income_dist(
                 ).values
                 / data[weight_var].sum()
             ).sum()
-            # F = st.lognorm.cdf(z_line, s=(sigmasq) ** 0.5, scale=np.exp(mu))
-            # f = st.lognorm.pdf(z_line, s=(sigmasq) ** 0.5, scale=np.exp(mu))
-            # f = f / np.sum(f)
-            # f_prime = np.gradient(f, edge_order=2)
 
-            # analytical derivative of lognormal
+            # analytical expressions for F, f, f_prime
             sigma = np.sqrt(sigmasq)
             F = (1 / 2) * (
                 1
@@ -277,15 +274,115 @@ def compute_income_dist(
             data2 = data.copy()
             # for income > 500,000 replace actual values with draws from
             # a pareto distribution
-            data2.loc[data2[income_measure] > 500_000, income_measure] = st.pareto.rvs(
-                1.16, size=data2.loc[data2[income_measure] > 500_000].shape[0]
-            )
+            # data2.loc[data2[income_measure] > 500_000, income_measure] = st.pareto.rvs(
+            #     1.16, size=data2.loc[data2[income_measure] > 500_000].shape[0]
+            # )
             f_function = st.gaussian_kde(
                 data2[income_measure],
                 bw_method=kde_bw,
                 weights=data[weight_var],
             )
-            f = f_function(z_line)
+
+            # Now splice a Pareto distribution to the KDE
+            # Step 1: Define the splicing point
+            splice_point = 350_000 #500000
+            # Step 2: Evaluate the KDE at the splice point to find
+            # its value and derivative
+            kde_value_at_splice = f_function(splice_point)
+            # For the derivative, use a small delta to compute numerically
+            delta = 0.0001 * splice_point
+            kde_derivative_at_splice = (
+                f_function(splice_point + delta) -
+                f_function(splice_point - delta)) / (2 * delta)
+
+            # Step 3: Define a Pareto distribution function that matches
+            # at the splice point
+            # The Pareto PDF is: α * x_m^α / x^(α+1) where x_m is the
+            # scale parameter and α is the shape parameter
+
+            # def find_pareto_parameters():
+            #     def objective(params):
+            #         alpha, scale = params
+            #         # Calculate the Pareto PDF value at the splice point
+            #         pareto_value = alpha * (scale**alpha) / (splice_point**(alpha+1))
+            #         # Calculate the Pareto derivative at the splice point
+            #         pareto_derivative = -alpha * (alpha+1) * (scale**alpha) / (splice_point**(alpha+2))
+
+            #         # Compute the error in matching both the value and derivative
+            #         value_error = (pareto_value - kde_value_at_splice)**2
+            #         derivative_error = (pareto_derivative - kde_derivative_at_splice)**2
+
+            #         # Return the combined error
+            #         return value_error + derivative_error
+
+            #     # Initial guess for alpha and scale
+            #     # initial_guess = [2.0, splice_point/2]
+            #     initial_guess = [1.3, splice_point/5]
+
+            #     # Constraints: alpha > 0, scale > 0
+            #     bounds = [(0.1, 10), (0.1*splice_point, splice_point)]
+
+            #     # Minimize the objective function
+            #     result = minimize(
+            #         objective, initial_guess, bounds=bounds,
+            #         method='L-BFGS-B')
+            #     print("Minimization result:", result)
+            #     if not result.success:
+            #         raise ValueError("Optimization failed: " + result.message)
+            #     if result.fun > 1e-6:
+            #         raise ValueError("Optimization did not converge to a good solution.")
+            #     return result.x
+
+            # # Get the optimal Pareto parameters
+            # alpha, scale = find_pareto_parameters()
+
+            # Directly solve for alpha
+            alpha = -(kde_derivative_at_splice * splice_point / kde_value_at_splice) - 1
+
+            # Now solve for scale using the first equation:
+            # kde_value = alpha * (scale^alpha) / (splice_point^(alpha+1))
+            # scale^alpha = kde_value * splice_point^(alpha+1) / alpha
+            scale = (kde_value_at_splice * (splice_point**(alpha+1)) / alpha)**(1/alpha)
+
+            print(f"Calculated alpha: {alpha}")
+            print(f"Calculated scale: {scale}")
+
+            # Verify the Pareto value at the splice point
+            pareto_value = alpha * (scale**alpha) / (splice_point**(alpha+1))
+            pareto_derivative = -alpha * (alpha+1) * (scale**alpha) / (splice_point**(alpha+2))
+
+            print(f"KDE value at splice: {kde_value_at_splice}")
+            print(f"Pareto value at splice: {pareto_value}")
+            print(f"Difference: {pareto_value - kde_value_at_splice}")
+
+            print(f"KDE derivative at splice: {kde_derivative_at_splice}")
+            print(f"Pareto derivative at splice: {pareto_derivative}")
+            print(f"Derivative difference: {pareto_derivative - kde_derivative_at_splice}")
+
+            # Step 4: Define the spliced PDF function
+            def spliced_pdf(x):
+                if np.isscalar(x):
+                    if x <= splice_point:
+                        return f_function(x)
+                    else:
+                        return alpha * (scale**alpha) / (x**(alpha+1))
+                else:
+                    # Handle array inputs
+                    x = np.asarray(x)
+                    result = np.zeros_like(x, dtype=float)
+                    kde_mask = x <= splice_point
+                    pareto_mask = x > splice_point
+
+                    if np.any(kde_mask):
+                        result[kde_mask] = f_function(x[kde_mask])
+                    if np.any(pareto_mask):
+                        result[pareto_mask] = alpha * (scale**alpha) / (x[pareto_mask]**(alpha+1))
+                    return result
+
+
+
+            # f = f_function(z_line)
+            f = spliced_pdf(z_line)
             f = f / np.sum(f)  # ensure sum to one
             F = np.cumsum(f)
             f_prime = np.gradient(f, z_line, edge_order=2)

From 107779c567e61ce0b8a44c5c59cca7bd6c88a2df Mon Sep 17 00:00:00 2001
From: Jason DeBacker <jdebacker@users.noreply.github.com>
Date: Tue, 1 Apr 2025 10:48:05 -0400
Subject: [PATCH 3/5] start g_z at ndex 0 regardless of method

---
 iot/iot_user.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/iot/iot_user.py b/iot/iot_user.py
index e392d58..c11109c 100644
--- a/iot/iot_user.py
+++ b/iot/iot_user.py
@@ -146,10 +146,10 @@ def plot(self, var="g_z"):
                 + "=%{y:.3f}<extra></extra>"
             )
         else:
-            if var == "g_z_numerical":
-                start_idx = 10  # numerical approximation not great near 0
-            else:
-                start_idx = 0
+            # if var == "g_z_numerical":
+            #     start_idx = 0  # numerical approximation not great near 0
+            # else:
+            start_idx = 0
             y = []
             for i in self.iot:
                 y.append(i.df()[var][start_idx:])

From 0534bde206e3201e59858bf9f7a55dfcd0429389 Mon Sep 17 00:00:00 2001
From: Jason DeBacker <jdebacker@users.noreply.github.com>
Date: Tue, 1 Apr 2025 10:54:30 -0400
Subject: [PATCH 4/5] format

---
 iot/inverse_optimal_tax.py | 45 ++++++++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/iot/inverse_optimal_tax.py b/iot/inverse_optimal_tax.py
index c917059..c7a19f6 100644
--- a/iot/inverse_optimal_tax.py
+++ b/iot/inverse_optimal_tax.py
@@ -285,15 +285,16 @@ def compute_income_dist(
 
             # Now splice a Pareto distribution to the KDE
             # Step 1: Define the splicing point
-            splice_point = 350_000 #500000
+            splice_point = 350_000  # 500000
             # Step 2: Evaluate the KDE at the splice point to find
             # its value and derivative
             kde_value_at_splice = f_function(splice_point)
             # For the derivative, use a small delta to compute numerically
             delta = 0.0001 * splice_point
             kde_derivative_at_splice = (
-                f_function(splice_point + delta) -
-                f_function(splice_point - delta)) / (2 * delta)
+                f_function(splice_point + delta)
+                - f_function(splice_point - delta)
+            ) / (2 * delta)
 
             # Step 3: Define a Pareto distribution function that matches
             # at the splice point
@@ -337,19 +338,35 @@ def compute_income_dist(
             # alpha, scale = find_pareto_parameters()
 
             # Directly solve for alpha
-            alpha = -(kde_derivative_at_splice * splice_point / kde_value_at_splice) - 1
+            alpha = (
+                -(
+                    kde_derivative_at_splice
+                    * splice_point
+                    / kde_value_at_splice
+                )
+                - 1
+            )
 
             # Now solve for scale using the first equation:
             # kde_value = alpha * (scale^alpha) / (splice_point^(alpha+1))
             # scale^alpha = kde_value * splice_point^(alpha+1) / alpha
-            scale = (kde_value_at_splice * (splice_point**(alpha+1)) / alpha)**(1/alpha)
+            scale = (
+                kde_value_at_splice * (splice_point ** (alpha + 1)) / alpha
+            ) ** (1 / alpha)
 
             print(f"Calculated alpha: {alpha}")
             print(f"Calculated scale: {scale}")
 
             # Verify the Pareto value at the splice point
-            pareto_value = alpha * (scale**alpha) / (splice_point**(alpha+1))
-            pareto_derivative = -alpha * (alpha+1) * (scale**alpha) / (splice_point**(alpha+2))
+            pareto_value = (
+                alpha * (scale**alpha) / (splice_point ** (alpha + 1))
+            )
+            pareto_derivative = (
+                -alpha
+                * (alpha + 1)
+                * (scale**alpha)
+                / (splice_point ** (alpha + 2))
+            )
 
             print(f"KDE value at splice: {kde_value_at_splice}")
             print(f"Pareto value at splice: {pareto_value}")
@@ -357,7 +374,9 @@ def compute_income_dist(
 
             print(f"KDE derivative at splice: {kde_derivative_at_splice}")
             print(f"Pareto derivative at splice: {pareto_derivative}")
-            print(f"Derivative difference: {pareto_derivative - kde_derivative_at_splice}")
+            print(
+                f"Derivative difference: {pareto_derivative - kde_derivative_at_splice}"
+            )
 
             # Step 4: Define the spliced PDF function
             def spliced_pdf(x):
@@ -365,7 +384,7 @@ def spliced_pdf(x):
                     if x <= splice_point:
                         return f_function(x)
                     else:
-                        return alpha * (scale**alpha) / (x**(alpha+1))
+                        return alpha * (scale**alpha) / (x ** (alpha + 1))
                 else:
                     # Handle array inputs
                     x = np.asarray(x)
@@ -376,11 +395,13 @@ def spliced_pdf(x):
                     if np.any(kde_mask):
                         result[kde_mask] = f_function(x[kde_mask])
                     if np.any(pareto_mask):
-                        result[pareto_mask] = alpha * (scale**alpha) / (x[pareto_mask]**(alpha+1))
+                        result[pareto_mask] = (
+                            alpha
+                            * (scale**alpha)
+                            / (x[pareto_mask] ** (alpha + 1))
+                        )
                     return result
 
-
-
             # f = f_function(z_line)
             f = spliced_pdf(z_line)
             f = f / np.sum(f)  # ensure sum to one

From dd29dc41754677f9ca7b25434a120885c3cddf2f Mon Sep 17 00:00:00 2001
From: Jason DeBacker <jdebacker@users.noreply.github.com>
Date: Thu, 15 May 2025 15:28:00 -0400
Subject: [PATCH 5/5] don't divide by integral

---
 iot/inverse_optimal_tax.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/iot/inverse_optimal_tax.py b/iot/inverse_optimal_tax.py
index c7a19f6..45d0db8 100644
--- a/iot/inverse_optimal_tax.py
+++ b/iot/inverse_optimal_tax.py
@@ -511,7 +511,7 @@ def sw_weights(self):
         integral = np.trapz(
             g_z * self.f, self.z
         )  # renormalize to integrate to 1
-        g_z = g_z / integral
+        g_z = g_z #/ integral
 
         # use Lockwood and Weinzierl formula, which should be equivalent but using numerical differentiation
         bracket_term = (