-
Notifications
You must be signed in to change notification settings - Fork 2
Open
Description
I noticed that the function `create_plot_build` erroneously calculates the average of the build times instead of finding the Pareto points that demarcate 80% recall, 90% recall, 95% recall, and 99% recall. That is, the function currently calculates the average build time for an 80% recall bucket, a 90% recall bucket, a 95% recall bucket, and a 99% recall bucket.
current implementation:
def create_plot_build(
    build_results, search_results, linestyles, fn_out, dataset, k, n_queries
):
    """Write a bar chart of mean build time per recall bucket to ``fn_out``.

    For every algorithm in ``search_results`` the search points are grouped
    into four recall buckets ([0.80, 0.90), [0.90, 0.95), [0.95, 0.99) and
    >= 0.99). Each bar is the arithmetic mean of the build times of the
    points that fall in that bucket. Note that this is a per-bucket average,
    not the build time of a single Pareto-optimal configuration.

    Parameters
    ----------
    build_results
        Mapping keyed by ``(algo_name, index_name)``. ``value[0][2]`` is read
        as the build time (presumably seconds, per the y-axis label --
        confirm against the code that produces it).
    search_results
        Mapping from algorithm to a sequence of rows. Column 0 is the
        algorithm name, column 1 the index name, column 2 the recall and
        column 3 the value used only to order the algorithms.
    linestyles
        Mapping from algorithm to a sequence whose first item is the bar
        color.
    fn_out
        Destination passed to ``Figure.savefig``.
    dataset
        Text used as the figure's suptitle.
    k, n_queries
        Values interpolated into the plot title.
    """
    # (row label, inclusive lower bound, exclusive upper bound or None).
    recall_buckets = (
        ("@80% Recall", 0.80, 0.90),
        ("@90% Recall", 0.9, 0.95),
        ("@95% Recall", 0.95, 0.99),
        ("@99% Recall", 0.99, None),
    )

    data = OrderedDict()
    colors = OrderedDict()

    # Sorting by mean y-value helps aligning plots with labels
    def mean_y(algo):
        points = np.array(search_results[algo], dtype=object)
        return -np.log(np.array(points[:, 3], dtype=np.float32)).mean()

    for algo in sorted(search_results.keys(), key=mean_y):
        points = np.array(search_results[algo], dtype=object)

        # Accumulators are local to the algorithm, so nothing depends on
        # len(linestyles) matching the number of algorithms plotted.
        totals = [0] * len(recall_buckets)
        counts = [0] * len(recall_buckets)

        rows = zip(points[:, 0], points[:, 1], points[:, 2])
        for algo_name, index_name, recall in rows:
            build_key = (algo_name, index_name)
            if build_key not in build_results:
                continue  # Skip if build result not found
            for slot, (_, lower, upper) in enumerate(recall_buckets):
                if recall >= lower and (upper is None or recall < upper):
                    totals[slot] += build_results[build_key][0][2]
                    counts[slot] += 1
                    break

        # Empty buckets stay at 0 and are turned into NaN below.
        data[algo] = [
            total / count if count > 0 else total
            for total, count in zip(totals, counts)
        ]
        colors[algo] = linestyles[algo][0]

    index = [label for label, _, _ in recall_buckets]
    df = pd.DataFrame(data, index=index)
    df.replace(0.0, np.nan, inplace=True)
    df = df.dropna(how="all")

    # Create the figure once and hand its axes to pandas. Calling
    # plt.figure() and then df.plot.bar() without ``ax`` leaves an empty
    # figure behind and the requested figsize is never applied.
    fig, ax = plt.subplots(figsize=(12, 9))
    df.plot.bar(rot=0, color=colors, ax=ax)

    # Add speedup annotations
    if 'LUCENE_HNSW' in df.columns and 'CAGRA_HNSW' in df.columns:
        y_max = ax.get_ylim()[1]
        for i, bucket in enumerate(df.index):
            lucene_time = df.loc[bucket, 'LUCENE_HNSW']
            cagra_time = df.loc[bucket, 'CAGRA_HNSW']
            both_present = pd.notna(lucene_time) and pd.notna(cagra_time)
            if both_present and lucene_time > 0 and cagra_time > 0:
                speedup = lucene_time / cagra_time
                # Anchored near the top of the axes (98% of the y-limit),
                # centered on the bucket's bar group.
                ax.text(
                    i,
                    y_max * 0.98,
                    f'{speedup:.1f}x',
                    ha='center',
                    va='bottom',
                    fontsize=9,
                    fontweight='bold',
                    bbox=dict(
                        boxstyle='round,pad=0.2',
                        facecolor='white',
                        alpha=0.9,
                        edgecolor='gray',
                    ),
                )

    print(f"writing build output to {fn_out}")
    ax.set_title(
        "Average Build Time within Recall Range "
        f"for k={k} n_queries={n_queries}"
    )
    fig.suptitle(f"{dataset}")
    ax.set_ylabel("Build Time (s)")
    fig.savefig(fn_out)
=======================
cc: @cjnolet @narangvivek10
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels