Skip to content

Commit c2afe48

Browse files
committed
Improved profiles and the stratified mix to use topics defined in the profile, rather than inferring them from generated samples. This ensures a consistent scope is defined.
1 parent 658b34d commit c2afe48

16 files changed

Lines changed: 239 additions & 38 deletions

Docs/DATA_PROFILES.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,14 @@
3232
"description": "Generates tasks requiring reasoning about frontend, backend, database design, API behavior, deployment, state management, architectural choices, and full-stack debugging.",
3333
"audience": "Full-stack developers, software engineers, architects.",
3434
"recordType": "alpaca",
35+
"topics": [
36+
"frontend",
37+
"backend",
38+
"api",
39+
"databases",
40+
"architecture",
41+
"devops"
42+
],
3543
"useCases": [
3644
"Train reasoning about full-stack workflows",
3745
"Model backend/frontend interaction",
@@ -72,6 +80,7 @@
7280
- **Audience:** Full-stack developers, software engineers, architects.
7381
- **Record Type:** alpaca
7482
- **Use Cases:** Train reasoning about full-stack workflows; Model backend/frontend interaction; Debug end-to-end system behavior; Design scalable architectures; Analyze data flow and API correctness.
83+
- **Topics:** frontend; backend; api; databases; architecture; devops.
7584
- **Instruction Label:** Engineering Task
7685
- **Input Label:** System Context
7786
- **Output Label:** Final Answer

components/ProfilesView.tsx

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ const initialBuilderState = {
4848
description: '',
4949
audience: '',
5050
useCases: '',
51+
topics: '',
5152
instructionLabel: 'Instruction',
5253
inputLabel: 'Input',
5354
outputLabel: 'Response',
@@ -82,6 +83,7 @@ const ProfilesView: React.FC = () => {
8283
customId: 'profile-builder-custom-id',
8384
audience: 'profile-builder-audience',
8485
useCases: 'profile-builder-use-cases',
86+
topics: 'profile-builder-topics',
8587
instructionLabel: 'profile-builder-instruction-label',
8688
outputLabel: 'profile-builder-output-label',
8789
inputLabel: 'profile-builder-input-label',
@@ -155,6 +157,7 @@ const ProfilesView: React.FC = () => {
155157
if (!id) throw new Error('Provide either an ID or a name to derive the ID.');
156158
const useCases = normalizeUseCases(builder.useCases);
157159
if (useCases.length === 0) throw new Error('Enter at least one use case.');
160+
const topics = normalizeUseCases(builder.topics);
158161

159162
const metadataFields = parseJsonField(builder.metadataJson, []);
160163
const evaluationAxes = parseJsonField(builder.evaluationJson, []);
@@ -166,6 +169,7 @@ const ProfilesView: React.FC = () => {
166169
description: builder.description.trim(),
167170
audience: builder.audience.trim(),
168171
useCases,
172+
topics: topics.length > 0 ? topics : undefined,
169173
recordType: 'alpaca',
170174
recordSchema: {
171175
instructionLabel: builder.instructionLabel.trim() || 'Instruction',
@@ -337,6 +341,17 @@ const ProfilesView: React.FC = () => {
337341
required
338342
/>
339343
</div>
344+
<div>
345+
<label htmlFor={builderFieldIds.topics} className="text-xs uppercase tracking-wide text-gray-400">Topics (optional; newline or comma separated)</label>
346+
<textarea
347+
id={builderFieldIds.topics}
348+
name="topics"
349+
value={builder.topics}
350+
onChange={e => handleBuilderChange('topics', e.target.value)}
351+
className="w-full mt-1 rounded-md bg-gray-800 border border-gray-700 p-2 text-sm text-white h-20"
352+
placeholder="e.g. analytics&#10;governance&#10;performance"
353+
/>
354+
</div>
340355
</div>
341356

342357
<div className="grid gap-4 md:grid-cols-2">

components/StratifiedPlannerView.tsx

Lines changed: 87 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import React, { useEffect, useMemo, useState } from 'react';
22
import { useAppStore } from '../store/useAppStore';
3-
import type { DatasetRecord } from '../types';
3+
import { getDatasetProfile } from '../data/datasetProfiles';
4+
import type { DatasetRecord, DatasetProfile } from '../types';
45

56
type AxisKey = 'length' | 'domain' | 'difficulty';
67
type TargetConfig = Record<AxisKey, Record<string, number>>;
@@ -12,6 +13,7 @@ const DEFAULT_TARGETS: TargetConfig = {
1213
};
1314

1415
const STORAGE_KEY = 'stratified-targets';
16+
const buildStorageKey = (profileId: string) => `${STORAGE_KEY}-${profileId || 'default'}`;
1517

1618
const clamp = (value: number) => Math.max(0, Math.min(100, Number.isFinite(value) ? value : 0));
1719

@@ -42,43 +44,110 @@ const getDifficulty = (record: DatasetRecord): string => {
4244
return (record as any)?.metadata?.difficulty || 'unlabeled';
4345
};
4446

47+
const buildDomainTargets = (profile: DatasetProfile | null, records: DatasetRecord[]): Record<string, number> => {
48+
const fromTopics = profile?.topics && profile.topics.length > 0
49+
? profile.topics.map(topic => topic.toString().toLowerCase())
50+
: null;
51+
if (fromTopics) {
52+
const even = 100 / fromTopics.length;
53+
return fromTopics.reduce<Record<string, number>>((acc, key) => {
54+
acc[key] = Number(even.toFixed(2));
55+
return acc;
56+
}, {});
57+
}
58+
59+
const domainCounts: Record<string, number> = {};
60+
records.forEach(record => {
61+
const domainKey = getDomain(record).toString().toLowerCase();
62+
domainCounts[domainKey] = (domainCounts[domainKey] || 0) + 1;
63+
});
64+
const domainKeys = Object.keys(domainCounts);
65+
if (domainKeys.length === 0) return DEFAULT_TARGETS.domain;
66+
67+
const evenSplit = domainKeys.reduce<Record<string, number>>((acc, key) => {
68+
acc[key] = Number((100 / domainKeys.length).toFixed(2));
69+
return acc;
70+
}, {});
71+
72+
return evenSplit;
73+
};
74+
4575
export const StratifiedPlannerView: React.FC = () => {
4676
const dataset = useAppStore(state => state.dataset);
77+
const datasetProfileId = useAppStore(state => state.datasetProfileId);
4778
const updateConfig = useAppStore(state => state.updateConfig);
4879
const autoInject = useAppStore(state => state.config.autoInjectStrataPrompt ?? false);
4980
const lastHint = useAppStore(state => state.config.stratifiedNextAsk ?? '');
5081

51-
const [targets, setTargets] = useState<TargetConfig>(() => {
52-
if (typeof window === 'undefined') return DEFAULT_TARGETS;
82+
const filteredDataset = useMemo(() => {
83+
return dataset.filter(record => (record as any)?.profileId === datasetProfileId);
84+
}, [dataset, datasetProfileId]);
85+
86+
const storageKey = useMemo(() => buildStorageKey(String(datasetProfileId ?? 'default')), [datasetProfileId]);
87+
88+
const [targets, setTargets] = useState<TargetConfig>(DEFAULT_TARGETS);
89+
const [hasCustomTargets, setHasCustomTargets] = useState<boolean>(false);
90+
const [promptPreview, setPromptPreview] = useState<string>('');
91+
const [autoInjectEnabled, setAutoInjectEnabled] = useState<boolean>(autoInject);
92+
const activeProfile = useMemo(() => getDatasetProfile(datasetProfileId), [datasetProfileId]);
93+
94+
useEffect(() => {
95+
if (typeof window === 'undefined') return;
5396
try {
54-
const raw = window.localStorage.getItem(STORAGE_KEY);
55-
if (!raw) return DEFAULT_TARGETS;
97+
const raw = window.localStorage.getItem(storageKey);
98+
if (!raw) {
99+
setTargets({
100+
...DEFAULT_TARGETS,
101+
domain: buildDomainTargets(activeProfile, filteredDataset),
102+
});
103+
setHasCustomTargets(false);
104+
return;
105+
}
56106
const parsed = JSON.parse(raw);
57-
return { ...DEFAULT_TARGETS, ...parsed };
107+
setTargets({ ...DEFAULT_TARGETS, ...parsed });
108+
setHasCustomTargets(true);
58109
} catch {
59-
return DEFAULT_TARGETS;
110+
setTargets({
111+
...DEFAULT_TARGETS,
112+
domain: buildDomainTargets(activeProfile, filteredDataset),
113+
});
114+
setHasCustomTargets(false);
60115
}
61-
});
62-
const [promptPreview, setPromptPreview] = useState<string>('');
63-
const [autoInjectEnabled, setAutoInjectEnabled] = useState<boolean>(autoInject);
116+
}, [storageKey, filteredDataset, activeProfile]);
64117

65118
useEffect(() => {
66119
if (typeof window !== 'undefined') {
67-
window.localStorage.setItem(STORAGE_KEY, JSON.stringify(targets));
120+
window.localStorage.setItem(storageKey, JSON.stringify(targets));
68121
}
69-
}, [targets]);
122+
}, [targets, storageKey]);
70123

71124
useEffect(() => {
72125
setAutoInjectEnabled(autoInject);
73126
}, [autoInject]);
74127

128+
useEffect(() => {
129+
if (hasCustomTargets) return;
130+
if (!filteredDataset.length && !(activeProfile?.topics?.length)) return;
131+
const dynamicDefaults = {
132+
...DEFAULT_TARGETS,
133+
domain: buildDomainTargets(activeProfile, filteredDataset),
134+
};
135+
const currentDomain = targets.domain;
136+
const nextDomain = dynamicDefaults.domain;
137+
const sameKeys = Object.keys(currentDomain).length === Object.keys(nextDomain).length &&
138+
Object.keys(currentDomain).every(key => Math.abs((currentDomain[key] ?? 0) - (nextDomain[key] ?? 0)) < 0.001);
139+
if (!sameKeys) {
140+
setTargets(prev => ({ ...prev, domain: nextDomain }));
141+
}
142+
}, [filteredDataset, hasCustomTargets, targets.domain, activeProfile]);
143+
75144
const stats = useMemo(() => {
76145
const totals: Record<AxisKey, Record<string, number>> = {
77146
length: {},
78147
domain: {},
79148
difficulty: {},
80149
};
81-
dataset.forEach(record => {
150+
filteredDataset.forEach(record => {
82151
const lengthBucket = bucketLength(record);
83152
totals.length[lengthBucket] = (totals.length[lengthBucket] || 0) + 1;
84153

@@ -88,7 +157,7 @@ export const StratifiedPlannerView: React.FC = () => {
88157
const difficulty = getDifficulty(record).toString().toLowerCase();
89158
totals.difficulty[difficulty] = (totals.difficulty[difficulty] || 0) + 1;
90159
});
91-
const totalRecords = dataset.length || 1;
160+
const totalRecords = filteredDataset.length || 1;
92161
const toPercent = (counts: Record<string, number>) =>
93162
Object.fromEntries(
94163
Object.entries(counts).map(([k, v]) => [k, Number(((v / totalRecords) * 100).toFixed(2))])
@@ -101,7 +170,7 @@ export const StratifiedPlannerView: React.FC = () => {
101170
difficulty: toPercent(totals.difficulty),
102171
},
103172
};
104-
}, [dataset]);
173+
}, [filteredDataset]);
105174

106175
const computeGaps = (axis: AxisKey) => {
107176
const desired = targets[axis];
@@ -148,7 +217,7 @@ export const StratifiedPlannerView: React.FC = () => {
148217
useEffect(() => {
149218
recomputePrompt();
150219
// eslint-disable-next-line react-hooks/exhaustive-deps
151-
}, [dataset, targets]);
220+
}, [filteredDataset, targets]);
152221

153222
useEffect(() => {
154223
if (promptPreview && promptPreview !== lastHint) {
@@ -158,6 +227,7 @@ export const StratifiedPlannerView: React.FC = () => {
158227
}, [promptPreview]);
159228

160229
const handleTargetChange = (axis: AxisKey, bucket: string, value: number) => {
230+
setHasCustomTargets(true);
161231
setTargets(prev => ({
162232
...prev,
163233
[axis]: {
@@ -241,7 +311,7 @@ export const StratifiedPlannerView: React.FC = () => {
241311
</p>
242312
</div>
243313
<div className="bg-gray-800 border border-gray-700 rounded-lg px-4 py-2 text-sm text-gray-200">
244-
Dataset size: <span className="font-semibold text-white">{dataset.length.toLocaleString()}</span> records
314+
Dataset size: <span className="font-semibold text-white">{filteredDataset.length.toLocaleString()}</span> records
245315
</div>
246316
</header>
247317

0 commit comments

Comments
 (0)