From 0136489c4cfbe7070881baba87e84c10b7815d70 Mon Sep 17 00:00:00 2001 From: Jason DeBacker Date: Thu, 27 Mar 2025 18:36:50 -0400 Subject: [PATCH 1/5] edits to kde --- iot/inverse_optimal_tax.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/iot/inverse_optimal_tax.py b/iot/inverse_optimal_tax.py index c262d05..b8556f1 100644 --- a/iot/inverse_optimal_tax.py +++ b/iot/inverse_optimal_tax.py @@ -274,14 +274,21 @@ def compute_income_dist( ) elif dist_type == "kde": # uses the original full data for kde estimation + data2 = data.copy() + # for income > 500,000 replace actual values with draws from + # a pareto distribution + data2.loc[data2[income_measure] > 500_000, income_measure] = st.pareto.rvs( + 1.16, size=data2.loc[data2[income_measure] > 500_000].shape[0] + ) f_function = st.gaussian_kde( - data[income_measure], - # bw_method=kde_bw, + data2[income_measure], + bw_method=kde_bw, weights=data[weight_var], ) - f = f_function.pdf(z_line) + f = f_function(z_line) + f = f / np.sum(f) # ensure sum to one F = np.cumsum(f) - f_prime = np.gradient(f, edge_order=2) + f_prime = np.gradient(f, z_line, edge_order=2) elif dist_type == "Pln": def pln_pdf(y, mu, sigma, alpha): From 530fbe045ac58e17d070e001c870475fdfd8a2a3 Mon Sep 17 00:00:00 2001 From: Jason DeBacker Date: Tue, 1 Apr 2025 10:47:46 -0400 Subject: [PATCH 2/5] splice pareto --- iot/inverse_optimal_tax.py | 115 ++++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 9 deletions(-) diff --git a/iot/inverse_optimal_tax.py b/iot/inverse_optimal_tax.py index b8556f1..c917059 100644 --- a/iot/inverse_optimal_tax.py +++ b/iot/inverse_optimal_tax.py @@ -5,6 +5,7 @@ from statsmodels.nonparametric.kernel_regression import KernelReg from scipy.interpolate import UnivariateSpline from scipy.linalg import lstsq +from scipy.optimize import minimize class IOT: @@ -246,12 +247,8 @@ def compute_income_dist( ).values / data[weight_var].sum() ).sum() - # F = st.lognorm.cdf(z_line, s=(sigmasq) ** 0.5, scale=np.exp(mu)) - # f = st.lognorm.pdf(z_line, s=(sigmasq) ** 0.5, scale=np.exp(mu)) - # f = f / np.sum(f) - # f_prime = np.gradient(f, edge_order=2) - # analytical derivative of lognormal + # analytical expressions for F, f, f_prime sigma = np.sqrt(sigmasq) F = (1 / 2) * ( 1 @@ -277,15 +274,115 @@ def compute_income_dist( data2 = data.copy() # for income > 500,000 replace actual values with draws from # a pareto distribution - data2.loc[data2[income_measure] > 500_000, income_measure] = st.pareto.rvs( - 1.16, size=data2.loc[data2[income_measure] > 500_000].shape[0] - ) + # data2.loc[data2[income_measure] > 500_000, income_measure] = st.pareto.rvs( + # 1.16, size=data2.loc[data2[income_measure] > 500_000].shape[0] + # ) f_function = st.gaussian_kde( data2[income_measure], bw_method=kde_bw, weights=data[weight_var], ) - f = f_function(z_line) + + # Now splice a Pareto distribution to the KDE + # Step 1: Define the splicing point + splice_point = 350_000 #500000 + # Step 2: Evaluate the KDE at the splice point to find + # its value and derivative + kde_value_at_splice = f_function(splice_point) + # For the derivative, use a small delta to compute numerically + delta = 0.0001 * splice_point + kde_derivative_at_splice = ( + f_function(splice_point + delta) - + f_function(splice_point - delta)) / (2 * delta) + + # Step 3: Define a Pareto distribution function that matches + # at the splice point + # The Pareto PDF is: α * x_m^α / x^(α+1) where x_m is the + # scale parameter and α is the shape parameter + + # def find_pareto_parameters(): + # def objective(params): + # alpha, scale = params + # # Calculate the Pareto PDF value at the splice point + # pareto_value = alpha * (scale**alpha) / (splice_point**(alpha+1)) + # # Calculate the Pareto derivative at the splice point + # pareto_derivative = -alpha * (alpha+1) * (scale**alpha) / (splice_point**(alpha+2)) + + # # Compute the error in matching both the value and derivative + # value_error = (pareto_value - kde_value_at_splice)**2 + # derivative_error = (pareto_derivative - kde_derivative_at_splice)**2 + + # # Return the combined error + # return value_error + derivative_error + + # # Initial guess for alpha and scale + # # initial_guess = [2.0, splice_point/2] + # initial_guess = [1.3, splice_point/5] + + # # Constraints: alpha > 0, scale > 0 + # bounds = [(0.1, 10), (0.1*splice_point, splice_point)] + + # # Minimize the objective function + # result = minimize( + # objective, initial_guess, bounds=bounds, + # method='L-BFGS-B') + # print("Minimization result:", result) + # if not result.success: + # raise ValueError("Optimization failed: " + result.message) + # if result.fun > 1e-6: + # raise ValueError("Optimization did not converge to a good solution.") + # return result.x + + # # Get the optimal Pareto parameters + # alpha, scale = find_pareto_parameters() + + # Directly solve for alpha + alpha = -(kde_derivative_at_splice * splice_point / kde_value_at_splice) - 1 + + # Now solve for scale using the first equation: + # kde_value = alpha * (scale^alpha) / (splice_point^(alpha+1)) + # scale^alpha = kde_value * splice_point^(alpha+1) / alpha + scale = (kde_value_at_splice * (splice_point**(alpha+1)) / alpha)**(1/alpha) + + print(f"Calculated alpha: {alpha}") + print(f"Calculated scale: {scale}") + + # Verify the Pareto value at the splice point + pareto_value = alpha * (scale**alpha) / (splice_point**(alpha+1)) + pareto_derivative = -alpha * (alpha+1) * (scale**alpha) / (splice_point**(alpha+2)) + + print(f"KDE value at splice: {kde_value_at_splice}") + print(f"Pareto value at splice: {pareto_value}") + print(f"Difference: {pareto_value - kde_value_at_splice}") + + print(f"KDE derivative at splice: {kde_derivative_at_splice}") + print(f"Pareto derivative at splice: {pareto_derivative}") + print(f"Derivative difference: {pareto_derivative - kde_derivative_at_splice}") + + # Step 4: Define the spliced PDF function + def spliced_pdf(x): + if np.isscalar(x): + if x <= splice_point: + return f_function(x) + else: + return alpha * (scale**alpha) / (x**(alpha+1)) + else: + # Handle array inputs + x = np.asarray(x) + result = np.zeros_like(x, dtype=float) + kde_mask = x <= splice_point + pareto_mask = x > splice_point + + if np.any(kde_mask): + result[kde_mask] = f_function(x[kde_mask]) + if np.any(pareto_mask): + result[pareto_mask] = alpha * (scale**alpha) / (x[pareto_mask]**(alpha+1)) + return result + + + + # f = f_function(z_line) + f = spliced_pdf(z_line) f = f / np.sum(f) # ensure sum to one F = np.cumsum(f) f_prime = np.gradient(f, z_line, edge_order=2) From 107779c567e61ce0b8a44c5c59cca7bd6c88a2df Mon Sep 17 00:00:00 2001 From: Jason DeBacker Date: Tue, 1 Apr 2025 10:48:05 -0400 Subject: [PATCH 3/5] start g_z at ndex 0 regardless of method --- iot/iot_user.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/iot/iot_user.py b/iot/iot_user.py index e392d58..c11109c 100644 --- a/iot/iot_user.py +++ b/iot/iot_user.py @@ -146,10 +146,10 @@ def plot(self, var="g_z"): + "=%{y:.3f}" ) else: - if var == "g_z_numerical": - start_idx = 10 # numerical approximation not great near 0 - else: - start_idx = 0 + # if var == "g_z_numerical": + # start_idx = 0 # numerical approximation not great near 0 + # else: + start_idx = 0 y = [] for i in self.iot: y.append(i.df()[var][start_idx:]) From 0534bde206e3201e59858bf9f7a55dfcd0429389 Mon Sep 17 00:00:00 2001 From: Jason DeBacker Date: Tue, 1 Apr 2025 10:54:30 -0400 Subject: [PATCH 4/5] format --- iot/inverse_optimal_tax.py | 45 ++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/iot/inverse_optimal_tax.py b/iot/inverse_optimal_tax.py index c917059..c7a19f6 100644 --- a/iot/inverse_optimal_tax.py +++ b/iot/inverse_optimal_tax.py @@ -285,15 +285,16 @@ def compute_income_dist( # Now splice a Pareto distribution to the KDE # Step 1: Define the splicing point - splice_point = 350_000 #500000 + splice_point = 350_000 # 500000 # Step 2: Evaluate the KDE at the splice point to find # its value and derivative kde_value_at_splice = f_function(splice_point) # For the derivative, use a small delta to compute numerically delta = 0.0001 * splice_point kde_derivative_at_splice = ( - f_function(splice_point + delta) - - f_function(splice_point - delta)) / (2 * delta) + f_function(splice_point + delta) + - f_function(splice_point - delta) + ) / (2 * delta) # Step 3: Define a Pareto distribution function that matches # at the splice point @@ -337,19 +338,35 @@ def compute_income_dist( # alpha, scale = find_pareto_parameters() # Directly solve for alpha - alpha = -(kde_derivative_at_splice * splice_point / kde_value_at_splice) - 1 + alpha = ( + -( + kde_derivative_at_splice + * splice_point + / kde_value_at_splice + ) + - 1 + ) # Now solve for scale using the first equation: # kde_value = alpha * (scale^alpha) / (splice_point^(alpha+1)) # scale^alpha = kde_value * splice_point^(alpha+1) / alpha - scale = (kde_value_at_splice * (splice_point**(alpha+1)) / alpha)**(1/alpha) + scale = ( + kde_value_at_splice * (splice_point ** (alpha + 1)) / alpha + ) ** (1 / alpha) print(f"Calculated alpha: {alpha}") print(f"Calculated scale: {scale}") # Verify the Pareto value at the splice point - pareto_value = alpha * (scale**alpha) / (splice_point**(alpha+1)) - pareto_derivative = -alpha * (alpha+1) * (scale**alpha) / (splice_point**(alpha+2)) + pareto_value = ( + alpha * (scale**alpha) / (splice_point ** (alpha + 1)) + ) + pareto_derivative = ( + -alpha + * (alpha + 1) + * (scale**alpha) + / (splice_point ** (alpha + 2)) + ) print(f"KDE value at splice: {kde_value_at_splice}") print(f"Pareto value at splice: {pareto_value}") @@ -357,7 +374,9 @@ def compute_income_dist( print(f"KDE derivative at splice: {kde_derivative_at_splice}") print(f"Pareto derivative at splice: {pareto_derivative}") - print(f"Derivative difference: {pareto_derivative - kde_derivative_at_splice}") + print( + f"Derivative difference: {pareto_derivative - kde_derivative_at_splice}" + ) # Step 4: Define the spliced PDF function def spliced_pdf(x): @@ -365,7 +384,7 @@ def spliced_pdf(x): if x <= splice_point: return f_function(x) else: - return alpha * (scale**alpha) / (x**(alpha+1)) + return alpha * (scale**alpha) / (x ** (alpha + 1)) else: # Handle array inputs x = np.asarray(x) @@ -376,11 +395,13 @@ def spliced_pdf(x): if np.any(kde_mask): result[kde_mask] = f_function(x[kde_mask]) if np.any(pareto_mask): - result[pareto_mask] = alpha * (scale**alpha) / (x[pareto_mask]**(alpha+1)) + result[pareto_mask] = ( + alpha + * (scale**alpha) + / (x[pareto_mask] ** (alpha + 1)) + ) return result - - # f = f_function(z_line) f = spliced_pdf(z_line) f = f / np.sum(f) # ensure sum to one From dd29dc41754677f9ca7b25434a120885c3cddf2f Mon Sep 17 00:00:00 2001 From: Jason DeBacker Date: Thu, 15 May 2025 15:28:00 -0400 Subject: [PATCH 5/5] don't divide by integral --- iot/inverse_optimal_tax.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iot/inverse_optimal_tax.py b/iot/inverse_optimal_tax.py index c7a19f6..45d0db8 100644 --- a/iot/inverse_optimal_tax.py +++ b/iot/inverse_optimal_tax.py @@ -511,7 +511,7 @@ def sw_weights(self): integral = np.trapz( g_z * self.f, self.z ) # renormalize to integrate to 1 - g_z = g_z / integral + g_z = g_z #/ integral # use Lockwood and Weinzierl formula, which should be equivalent but using numerical differentiation bracket_term = (