Skip to content

plot_pareto.py does not plot pareto of build-times, it plots an average #9

@nvzm123

Description

@nvzm123

I noticed that the function `create_plot_build` erroneously calculates the average of the build times instead of finding the Pareto points that demarcate 80% recall, 90% recall, 95% recall, and 99% recall. Specifically, the function calculates the average build time for an 80% recall bucket, a 90% recall bucket, a 95% recall bucket, and a 99% recall bucket.

current implementation:
def create_plot_build(
    build_results, search_results, linestyles, fn_out, dataset, k, n_queries
):
    """Save a grouped bar chart of mean build time per recall bucket.

    For every algorithm in ``search_results`` the search points are binned
    by recall into [0.80, 0.90), [0.90, 0.95), [0.95, 0.99) and [0.99, ...),
    and the build times of the indexes behind the points in each bin are
    averaged. This is a per-bucket average, not a Pareto frontier of build
    times. Points with recall below 0.80, and points whose
    ``(algo_name, index_name)`` pair is missing from ``build_results``, are
    ignored.

    Parameters
    ----------
    build_results : mapping
        Keyed by ``(algo_name, index_name)``. The build time is read from
        ``build_results[key][0][2]``.
    search_results : mapping
        Maps an algorithm to a sequence of rows. Column 0 is read as the
        algorithm name, column 1 as the index name and column 2 as recall.
        Column 3 is only used to order the algorithms (presumably a
        throughput-like metric -- confirm against the caller).
    linestyles : mapping
        ``linestyles[algo][0]`` is used as the bar color of ``algo``.
    fn_out : str or path-like
        Destination passed to ``Figure.savefig``.
    dataset : str
        Shown as the figure's super-title.
    k, n_queries
        Interpolated into the axes title.
    """
    # Inclusive lower recall bound of each bucket, in ascending order. A
    # bucket extends up to the next bound; the last one is open-ended.
    recall_floors = (0.80, 0.90, 0.95, 0.99)
    index = [
        "@80% Recall",
        "@90% Recall",
        "@95% Recall",
        "@99% Recall",
    ]

    data = OrderedDict()
    colors = OrderedDict()

    # Sorting by mean y-value helps aligning plots with labels
    def mean_y(algo):
        points = np.array(search_results[algo], dtype=object)
        return -np.log(np.array(points[:, 3], dtype=np.float32)).mean()

    for algo in sorted(search_results, key=mean_y):
        points = np.array(search_results[algo], dtype=object)
        totals = [0] * len(recall_floors)
        counts = [0] * len(recall_floors)

        for algo_name, index_name, recall in points[:, :3]:
            build_key = (algo_name, index_name)
            if build_key not in build_results:
                continue  # Skip if build result not found

            # The floors are ascending, so the last one satisfied identifies
            # the bucket. A NaN recall satisfies none and is skipped.
            bucket = None
            for pos, floor in enumerate(recall_floors):
                if recall >= floor:
                    bucket = pos
            if bucket is None:
                continue  # Recall below the lowest bucket

            totals[bucket] += build_results[build_key][0][2]
            counts[bucket] += 1

        # Empty buckets stay 0 so they are turned into NaN below.
        data[algo] = [
            total / count if count else 0
            for total, count in zip(totals, counts)
        ]
        colors[algo] = linestyles[algo][0]

    df = pd.DataFrame(data, index=index)
    # Zero marks "no data"; NaN keeps those bars out of the chart, and
    # buckets that are empty for every algorithm are dropped entirely.
    df = df.replace(0.0, np.nan).dropna(how="all")

    # DataFrame.plot opens its own figure when no ``ax`` is given, so the
    # size has to be passed here; a preceding ``plt.figure(figsize=...)``
    # would only leave an unused, empty figure behind.
    ax = df.plot.bar(rot=0, color=colors, figsize=(12, 9))
    fig = ax.get_figure()

    # Add speedup annotations
    if 'LUCENE_HNSW' in df.columns and 'CAGRA_HNSW' in df.columns:
        y_max = ax.get_ylim()[1]

        for i, bucket in enumerate(df.index):
            lucene_time = df.loc[bucket, 'LUCENE_HNSW']
            cagra_time = df.loc[bucket, 'CAGRA_HNSW']

            if (
                pd.notna(lucene_time)
                and pd.notna(cagra_time)
                and lucene_time > 0
                and cagra_time > 0
            ):
                speedup = lucene_time / cagra_time
                # Position annotations just above the bars, below subtitle
                ax.text(
                    i,
                    y_max * 0.98,
                    f'{speedup:.1f}x',
                    ha='center',
                    va='bottom',
                    fontsize=9,
                    fontweight='bold',
                    bbox=dict(
                        boxstyle='round,pad=0.2',
                        facecolor='white',
                        alpha=0.9,
                        edgecolor='gray',
                    ),
                )

    print(f"writing build output to {fn_out}")
    ax.set_title(
        "Average Build Time within Recall Range "
        f"for k={k} n_queries={n_queries}"
    )
    fig.suptitle(f"{dataset}")
    ax.set_ylabel("Build Time (s)")
    fig.savefig(fn_out)

=======================
cc: @cjnolet @narangvivek10

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions