@@ -271,3 +271,151 @@ fn gen_and_test(description, test_fn, model, max_attempts) {
271271 }
272272 return {code: code, passed: false, attempts: max_att}
273273}
274+
275+ # ── Best-of-N with LLM judge ────────────────────────────────────────────────
276+
# best_of_n(prompt, n, criteria, system?, model?) → {answer, score, idx}
# Builds n identical {prompt, system} requests, sends them through
# batch_llm_call, scores the responses with llm_judge, and returns the
# response whose judge entry carries the highest "score".
#   prompt    text sent for every candidate
#   n         number of candidates to request
#   criteria  handed unchanged to llm_judge
#   system    optional; passed through _default(system, null)
#   model     optional; passed through _default(model, null)
# Returns {answer, score, idx}.
#   - No responses at all (e.g. n is 0): {answer: null, score: -1.0, idx: -1}.
#   - Responses but no judge entries: the first response with score -1.0.
# NOTE(review): relies on every llm_judge entry having "score" and "idx",
# with idx pointing into the responses array — confirm against llm_judge.
fn best_of_n(prompt, n, criteria, system, model) {
    h m = _default(model, null)
    h sys = _default(system, null)
    h prompts = arr_fill({prompt: prompt, system: sys}, n)
    h responses = batch_llm_call(prompts, m)
    # Nothing to choose from: bail out before responses[best_idx] can
    # index an empty array.
    if arr_len(responses) == 0 {
        return {answer: null, score: -1.0, idx: -1}
    }
    h scores = llm_judge(responses, criteria, m)
    # -1.0 is the "nothing judged yet" sentinel; any score above it wins.
    h best_score = -1.0
    h best_idx = 0
    h i = 0
    while i < arr_len(scores) {
        if scores[i]["score"] > best_score {
            best_score = scores[i]["score"]
            # Use the entry's own idx rather than the loop position.
            best_idx = scores[i]["idx"]
        }
        i = i + 1
    }
    return {answer: responses[best_idx], score: best_score, idx: best_idx}
}
297+
298+ # ── Parallel Self-Consistency CoT ───────────────────────────────────────────
299+
# parallel_cot(prompt, n, model?) → {answer, confidence, all_answers, votes}
# Sends the same step-by-step prompt n times through batch_llm_call,
# extracts a final answer from each response, and returns the answer that
# occurs most often.
#   prompt  the question to answer
#   n       number of samples to request
#   model   optional; passed through _default(model, null)
# Returns:
#   answer       the most frequent extracted answer ("" if there were none)
#   confidence   winning vote count divided by n (0.0 when n is not positive)
#   all_answers  every extracted answer, in response order
#   votes        dict mapping each distinct answer to its count
# Answers are compared as exact strings after trimming and truncation to
# the first 100 characters.
fn parallel_cot(prompt, n, model) {
    h m = _default(model, null)
    h sys = "Think step by step. After reasoning, write 'ANSWER:' followed by your final answer on its own line."
    h full = str_concat("Question: ", prompt, "\n\nReason carefully, then write ANSWER: <your answer>")
    h prompts = arr_fill({prompt: full, system: sys}, n)
    h raw_responses = batch_llm_call(prompts, m)
    h votes = {}
    h all_answers = []
    h i = 0
    while i < arr_len(raw_responses) {
        h parts = str_split(raw_responses[i], "ANSWER:")
        h ans = ""
        if arr_len(parts) >= 2 {
            # Take the text after the LAST marker, in case the marker is
            # repeated earlier in the reasoning.
            ans = str_trim(parts[arr_len(parts) - 1])
        } else {
            # No marker found: fall back to the whole response.
            ans = str_trim(raw_responses[i])
        }
        # Cap the vote key at 100 characters.
        h short_ans = str_slice(ans, 0, 100)
        arr_push(all_answers, short_ans)
        if dict_has(votes, short_ans) {
            votes[short_ans] = votes[short_ans] + 1
        } else {
            votes[short_ans] = 1
        }
        i = i + 1
    }
    h best = ""
    h best_count = 0
    h keys = dict_keys(votes)
    h k = 0
    while k < arr_len(keys) {
        # Strict ">" keeps the first key reached when counts tie.
        if votes[keys[k]] > best_count {
            best = keys[k]
            best_count = votes[keys[k]]
        }
        k = k + 1
    }
    # Avoid dividing by zero when no samples were requested.
    h confidence = 0.0
    if n > 0 {
        confidence = best_count / n
    }
    return {answer: best, confidence: confidence, all_answers: all_answers, votes: votes}
}
341+
342+ # ── Improve Until Convergence ───────────────────────────────────────────────
343+
# improve_until(text, criteria, threshold, max_rounds, model?) → {text, score, rounds}
# Repeatedly asks the model for an improved version of the current text,
# judges the current text and the candidate together with llm_judge, and
# keeps whichever scored higher. Stops once the kept score reaches
# threshold or after max_rounds rounds.
#   text        starting text
#   criteria    used in the improvement prompt and passed to llm_judge
#   threshold   score at which the loop stops early
#   max_rounds  upper bound on improvement rounds
#   model       optional; passed through _default(model, null)
# Returns:
#   text    the best version kept so far
#   score   score of that version from the last round (0.0 if no round ran)
#   rounds  number of rounds actually executed (0 when max_rounds is 0)
# NOTE(review): relies on llm_judge entries carrying "idx" (position in the
# submitted array) and "score" — confirm against llm_judge.
fn improve_until(text, criteria, threshold, max_rounds, model) {
    h m = _default(model, null)
    h current = text
    h round = 0
    h score = 0.0
    h sys = "Improve the following text according to the given criteria. Return only the improved version."
    while round < max_rounds {
        # Count the round up front so `round` always equals the number of
        # rounds executed, whether the loop ends by break or by exhaustion.
        round = round + 1
        h improve_prompt = str_concat(
            "Improve this based on: ", criteria, "\n\n", current
        )
        h candidate = llm_call(improve_prompt, m, sys)
        # idx 0 is the current text, idx 1 is the new candidate.
        h judgment = llm_judge([current, candidate], criteria, m)
        h candidate_score = 0.0
        h original_score = 0.0
        h ji = 0
        while ji < arr_len(judgment) {
            if judgment[ji]["idx"] == 0 { original_score = judgment[ji]["score"] }
            if judgment[ji]["idx"] == 1 { candidate_score = judgment[ji]["score"] }
            ji = ji + 1
        }
        # Only a strictly better candidate replaces the current text.
        if candidate_score > original_score {
            current = candidate
            score = candidate_score
        } else {
            score = original_score
        }
        if score >= threshold { break }
    }
    return {text: current, score: score, rounds: round}
}
377+
378+ # ── Debate: two agents argue, judge picks winner ────────────────────────────
379+
# debate(topic, rounds, model?) → {winner: "for"|"against"|"tie", reasoning, transcript}
# Runs a two-sided debate: each round, one FOR and one AGAINST argument are
# requested together through batch_llm_call; afterwards a judge call picks
# the stronger side based on the final argument from each side.
#   topic   the statement being debated
#   rounds  number of debate rounds to run
#   model   optional; passed through _default(model, null)
# Returns:
#   winner      the judge's "winner" value; "tie" when no rounds were run or
#               the verdict could not be parsed / has no "winner" field
#   reasoning   the judge's "reasoning" value, or the raw judge output when
#               that field is unavailable
#   transcript  one {round, for, against} record per round
fn debate(topic, rounds, model) {
    h transcript = []
    # With no rounds there are no arguments to judge; return early rather
    # than indexing the empty argument arrays below.
    if rounds < 1 {
        return {
            winner: "tie",
            reasoning: "No debate rounds were run.",
            transcript: transcript
        }
    }
    h m = _default(model, null)
    h for_sys = str_concat("You argue FOR: ", topic, ". Be concise and compelling.")
    h against_sys = str_concat("You argue AGAINST: ", topic, ". Be concise and compelling.")
    h for_args = []
    h against_args = []
    h r = 0
    while r < rounds {
        h context = str_concat("Topic: ", topic)
        # From the second round on, both sides see the latest argument
        # from each side (only the most recent one, not the full history).
        if arr_len(for_args) > 0 {
            context = str_concat(context, "\n\nPrevious FOR arguments: ", for_args[arr_len(for_args) - 1])
            context = str_concat(context, "\nPrevious AGAINST arguments: ", against_args[arr_len(against_args) - 1])
        }
        # Index 0 is the FOR reply, index 1 the AGAINST reply.
        h both = batch_llm_call([
            {prompt: context, system: for_sys},
            {prompt: context, system: against_sys}
        ], m)
        arr_push(for_args, both[0])
        arr_push(against_args, both[1])
        arr_push(transcript, {round: r + 1, for: both[0], against: both[1]})
        r = r + 1
    }
    h judge_sys = "You are an impartial debate judge. Output JSON: {\"winner\": \"for\" or \"against\", \"reasoning\": \"...\"}"
    h judge_prompt = str_concat(
        "Topic: ", topic, "\n\n",
        "FOR argument:\n", for_args[arr_len(for_args) - 1], "\n\n",
        "AGAINST argument:\n", against_args[arr_len(against_args) - 1], "\n\n",
        "Who made the stronger case?"
    )
    h verdict_raw = llm_call(judge_prompt, m, judge_sys)
    h verdict = extract_json(verdict_raw)
    # Unparseable output is treated as an empty verdict so the defaults
    # below apply.
    if verdict == null {
        verdict = {}
    }
    # Defaults cover a missing or partial verdict.
    h winner = "tie"
    h reasoning = verdict_raw
    if dict_has(verdict, "winner") { winner = verdict["winner"] }
    if dict_has(verdict, "reasoning") { reasoning = verdict["reasoning"] }
    return {
        winner: winner,
        reasoning: reasoning,
        transcript: transcript
    }
}
0 commit comments