plot_pareto.py does not plot pareto of build-times, it plots an average

I noticed that the function `create_plot_build` erroneously calculates the average of the build times. Instead of finding the Pareto points that demarcate 80% recall, 90% recall, 95% recall, and 99% recall, the function calculates the average build time for an 80% recall bucket, a 90% recall bucket, a 95% recall bucket, and a 99% recall bucket.

**current implementation:**
def _mean_build_times_by_recall(points, build_results):
    """Average the build times of one algorithm's runs per recall bucket.

    Buckets are [0.80, 0.90), [0.90, 0.95), [0.95, 0.99) and [0.99, ...).
    Runs with recall below 0.80, or whose ``(algo_name, index_name)`` key is
    missing from ``build_results``, are ignored.

    Parameters
    ----------
    points : 2-D object array
        One row per search run; column 0 is the algorithm name, column 1 the
        index name and column 2 the recall.
    build_results : mapping
        Keyed by ``(algo_name, index_name)``; ``value[0][2]`` is read as the
        build time.

    Returns
    -------
    list
        Four values (80/90/95/99 buckets). An empty bucket yields ``0``,
        which the caller treats as "no data".
    """
    lower_bounds = (0.80, 0.90, 0.95, 0.99)
    totals = [0] * len(lower_bounds)
    counts = [0] * len(lower_bounds)

    for algo_name, index_name, recall in zip(
        points[:, 0], points[:, 1], points[:, 2]
    ):
        build_key = (algo_name, index_name)
        if build_key not in build_results:
            continue  # Skip if build result not found

        # Walk from the highest threshold down so each run lands in exactly
        # one bucket: the highest one whose lower bound it reaches.
        for bucket in reversed(range(len(lower_bounds))):
            if recall >= lower_bounds[bucket]:
                totals[bucket] += build_results[build_key][0][2]
                counts[bucket] += 1
                break

    return [
        total / count if count else 0 for total, count in zip(totals, counts)
    ]


def create_plot_build(
    build_results, search_results, linestyles, fn_out, dataset, k, n_queries
):
    """Write a grouped bar chart of average build time per recall bucket.

    For every algorithm in ``search_results`` the build times of its runs are
    averaged within four recall ranges (>=80%, >=90%, >=95%, >=99%; each
    range ends where the next begins). Note that this is a per-bucket *mean*,
    not a Pareto frontier of build time versus recall.

    Parameters
    ----------
    build_results : mapping
        Keyed by ``(algo_name, index_name)``; ``value[0][2]`` is read as the
        build time.
    search_results : mapping
        Algorithm -> rows whose columns are (algo name, index name, recall,
        metric). Column 3 is only used to order the algorithms and must be
        positive because its logarithm is taken (presumably throughput --
        confirm against the caller).
    linestyles : mapping
        Algorithm -> sequence whose first element is the bar color.
    fn_out : path-like
        Destination of the saved figure.
    dataset, k, n_queries
        Only used in the figure titles.
    """
    data = OrderedDict()
    colors = OrderedDict()

    # Sorting by mean y-value helps aligning plots with labels
    def mean_y(algo):
        points = np.array(search_results[algo], dtype=object)
        return -np.log(np.array(points[:, 3], dtype=np.float32)).mean()

    for algo in sorted(search_results.keys(), key=mean_y):
        points = np.array(search_results[algo], dtype=object)
        data[algo] = _mean_build_times_by_recall(points, build_results)
        colors[algo] = linestyles[algo][0]

    index = [
        "@80% Recall",
        "@90% Recall",
        "@95% Recall",
        "@99% Recall",
    ]

    df = pd.DataFrame(data, index=index)
    # A zero means "no run fell in this bucket": hide those bars, and drop
    # recall buckets that no algorithm reached at all.
    df = df.replace(0.0, np.nan).dropna(how="all")

    # pandas opens its own figure when no ``ax`` is given, so the size has to
    # be passed here; a separate ``plt.figure(figsize=...)`` call would only
    # leave an unused blank figure behind.
    ax = df.plot.bar(rot=0, color=colors, figsize=(12, 9))
    fig = ax.get_figure()

    # Add speedup annotations
    if 'LUCENE_HNSW' in df.columns and 'CAGRA_HNSW' in df.columns:
        y_max = ax.get_ylim()[1]

        for i, bucket in enumerate(df.index):
            lucene_time = df.loc[bucket, 'LUCENE_HNSW']
            cagra_time = df.loc[bucket, 'CAGRA_HNSW']

            if (
                pd.notna(lucene_time)
                and pd.notna(cagra_time)
                and lucene_time > 0
                and cagra_time > 0
            ):
                speedup = lucene_time / cagra_time
                # Position annotations just above the bars, below subtitle
                ax.text(
                    i,
                    y_max * 0.98,
                    f'{speedup:.1f}x',
                    ha='center',
                    va='bottom',
                    fontsize=9,
                    fontweight='bold',
                    bbox=dict(
                        boxstyle='round,pad=0.2',
                        facecolor='white',
                        alpha=0.9,
                        edgecolor='gray',
                    ),
                )

    print(f"writing build output to {fn_out}")
    ax.set_title(
        "Average Build Time within Recall Range "
        f"for k={k} n_queries={n_queries}"
    )
    fig.suptitle(f"{dataset}")
    ax.set_ylabel("Build Time (s)")
    fig.savefig(fn_out)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

=======================
cc: @cjnolet @narangvivek10 

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

plot_pareto.py does not plot pareto of build-times, it plots an average #9

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

plot_pareto.py does not plot pareto of build-times, it plots an average #9

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions