diff --git a/data-pipeline/README.md b/data-pipeline/README.md new file mode 100644 index 0000000..0bc93b7 --- /dev/null +++ b/data-pipeline/README.md @@ -0,0 +1,242 @@ +# Log-to-Training Data Pipeline for Agentic AI (DMP 2026) + +## Core Idea + +In the OAN (OpenAgriNet) system, user interactions involve multi-step agent behavior (LLM + tools such as weather APIs, mandi prices, etc.). Raw logs from such systems are noisy, inconsistent and not directly suitable for training. + +This project builds a **data quality layer** that converts these raw logs into reliable, training-ready datasets. + +The pipeline: +- removes incorrect, weak and low-information responses +- detects inefficient multi-step behavior such as unnecessary tool/API calls +- ensures only reliable, high-quality interactions are used for training + +This directly improves the reliability and efficiency of responses generated by the OAN agentic AI system in real-world agricultural use cases. + + +- **Intelligent Quality Detection** - Filters out bad and weak responses +- **Explainable Filtering** - Shows WHY each entry was kept/removed +- **PII-Safe Processing** - Removes sensitive information +- **Training-Ready Exports** - JSONL format for LoRA fine-tuning +- **Inefficiency Detection** - Identifies and optimizes inefficient behavior + +## Where This Fits in OAN Architecture + +In the OAN pipeline: + +User → Voice/Text → Agentic LLM → Tools (APIs) → Response → Logs + +This project operates on the **logs generated by this system**. + +Logs → [This Pipeline] → Clean Training Data → Model Training + +By improving training data quality, this directly improves: +- response accuracy +- tool usage efficiency +- overall system reliability for farmers +- consistency of responses across different user queries + +This pipeline acts as a data quality layer between raw system logs and model training. + +## Example Pipeline Output + +``` +Total logs: 108 + +After filtering: +- Kept for training (high-quality): 100 +- Removed: + - Weak: 1 + - Bad: 7 + +Inefficient entries: 2 + +Removed entries summary: + - failure: error fetching (1) + - minimal response (1) + - empty response (1) + - too brief (1) + - failure: unable to (2) + - error only (2) + +Exported to data/output.jsonl +``` + +## Key Innovations + +### 1. **Multi-Tier Quality Assessment** +- **Bad entries**: Empty responses, minimal responses ("ok"), error responses +- **Weak entries**: Low-information responses (too brief or vague, e.g., "maybe", "might") +- **Good entries**: Helpful, detailed, professional responses +- **Explainable reasons**: Each filtered entry includes WHY it was removed + +### 2. **Smart Error Detection** +- **Not naive**: Doesn't flag mentions of "error" in context +- **Specific patterns**: Only actual failures like "error fetching", "unable to" +- **Failure categories**: Differentiates between different types of errors + +### 3. **Inefficiency Analysis** +- **Step-based detection**: >3 steps = inefficient +- **Optimization hints**: Provides suggestions for improvement +- **Training signals**: Includes inefficiency metadata in exports + +### 4. **Training-Ready Format** +- **JSONL structure**: One JSON object per line for streaming +- **Chat template**: User/assistant role format +- **Rich metadata**: Quality scores, reasons, optimization hints + +## Usage + +```bash +python main.py +``` + +**Output:** `data/output.jsonl` - 100 high-quality training examples + +## Folder Structure and Files + +### Root Directory +``` +data-pipeline/ +├── main.py # Main pipeline orchestrator +├── README.md # This documentation +├── requirements.txt # Python dependencies +├── data/ # Input and output data +└── src/ # Pipeline modules +``` + +### Data Directory (`data/`) +``` +data/ +├── raw_logs.json # Input: Raw production logs (108 entries) +└── output.jsonl # Output: Filtered training data (100 entries) +``` + +**`raw_logs.json`** - Contains raw agentic logs with: +- `input`: User question/request +- `output`: Bot response +- `agent_turns`: Array of interaction steps + +**`output.jsonl`** - Contains filtered training data with: +- `messages`: Chat template format (user/assistant roles) +- `metadata`: Quality scores, reasons, optimization hints + +### Source Modules (`src/`) + +#### `src/ingest.py` +**Purpose**: Load raw log data +**Function**: `load_logs(path)` - Reads JSON file and returns log array + +#### `src/clean.py` +**Purpose**: Remove Personally Identifiable Information (PII) +**Functions**: `remove_pii(text)` - Replaces sensitive data with placeholders +**Features**: +- Email detection → `` +- Phone detection → `` +- Government ID detection (Aadhaar, PAN) → `` + +#### `src/process.py` +**Purpose**: Process raw logs into structured format +**Functions**: +- `detect_inefficiency(entry)` - Identifies inefficient behavior (>3 steps) +- `process_logs(logs)` - Main processing function +**Features**: +- Calculates steps from `agent_turns` +- Applies inefficiency detection +- Cleans input/output text + +#### `src/score.py` +**Purpose**: Intelligent quality assessment and scoring +**Functions**: +- `is_bad(entry)` - Hard filters for obviously bad responses +- `is_weak(entry)` - Detects low-quality but not terrible responses +- `score_entry(entry)` - Main scoring function +**Quality Rules**: +- **Bad**: Empty, minimal ("ok"), error responses +- **Weak**: Too brief (<20 chars), uncertain language +- **Good**: Helpful, detailed responses + +#### `src/select.py` +**Purpose**: Diversity filtering and deduplication +**Functions**: +- `select_diverse(entries)` - Removes duplicate inputs +- `balance_by_steps(entries)` - Balances by complexity (unused in current version) +**Features**: +- Input normalization (lowercase, remove punctuation) +- Duplicate detection and removal + +#### `src/improve.py` +**Purpose**: Add optimization hints for inefficient entries +**Functions**: +- `add_optimization_hint(entry)` - Suggests improvements +**Features**: +- "Reduce unnecessary steps or tool calls" for inefficient entries +- "Efficient" for good entries + +#### `src/export.py` +**Purpose**: Export data in training-ready format +**Functions**: +- `format_chat_template(entry)` - Converts to chat template format +- `export_jsonl(data, path)` - Exports to JSONL file +**Features**: +- Chat template structure (user/assistant roles) +- Rich metadata inclusion (scores, reasons, hints) + +### Main Pipeline (`main.py`) + +**Purpose**: Orchestrates the entire pipeline +**Process**: +1. Load raw logs (`load_logs`) +2. Process and clean (`process_logs`) +3. Add optimization hints (`add_optimization_hint`) +4. Score entries (`score_entry`) +5. Separate by quality type +6. Apply diversity filtering (`select_diverse`) +7. Export results (`export_jsonl`) +8. Generate detailed logging report + + +## Quality Filtering Logic + +The pipeline uses a **three-tier quality system**: + +### Bad Entries (Always Removed) +- **Empty response**: `""` → Reason: "empty response" +- **Minimal response**: "ok", "yes", "done" → Reason: "minimal response" +- **Error only**: "err", "error", "failed" → Reason: "error only" +- **Failure patterns**: "error fetching", "unable to", "failed to" → Reason: "failure: [pattern]" + +### Weak Entries (Filtered Out) +- **Too brief**: <20 characters → Reason: "too brief" +- **Uncertain language**: "maybe", "might", "possibly" → Reason: "uncertain language" + +### Good Entries (Kept for Training) +- **Helpful responses**: Detailed, professional answers +- **Proper length**: >20 characters with substance +- **No uncertainty**: Confident, clear responses + +## Output Format + +Each training example in `output.jsonl`: +```json +{ + "messages": [ + {"role": "user", "content": "User question here"}, + {"role": "assistant", "content": "Helpful response here"} + ], + "metadata": { + "score": 2, + "steps": 3, + "inefficient": false, + "optimization_suggestion": "Efficient", + "quality_type": "good", + "quality_reason": "good response" + } +} +``` + +## Summary + +This project turns messy system logs into clean, reliable training data by filtering out weak or incorrect responses and keeping only interactions that actually help the model learn better. + + diff --git a/data-pipeline/data/output.jsonl b/data-pipeline/data/output.jsonl new file mode 100644 index 0000000..6789630 --- /dev/null +++ b/data-pipeline/data/output.jsonl @@ -0,0 +1,100 @@ +{"messages": [{"role": "user", "content": "What is the weather today? My email is "}, {"role": "assistant", "content": "The weather today is sunny with a high of 75\u00b0F. Perfect day for outdoor activities!"}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I create a new account?"}, {"role": "assistant", "content": "To create a new account, click the Sign Up button and fill in your details."}], "metadata": {"score": 2, "steps": 2, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "Show me my order history"}, {"role": "assistant", "content": "Here are your recent orders: Order #1234 - Delivered, Order #1235 - In Transit, Order #1236 - Processing. Each order includes detailed tracking information and delivery estimates."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is the weather today?"}, {"role": "assistant", "content": "The weather today is sunny with a high of 75\u00b0F. Perfect day for outdoor activities!"}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "Create account instructions"}, {"role": "assistant", "content": "To create a new account, click the Sign Up button and fill in your details."}], "metadata": {"score": 2, "steps": 2, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "Order history request"}, {"role": "assistant", "content": "Here are your recent orders: Order #1234 - Delivered, Order #1235 - In Transit, Order #1236 - Processing. Each order includes detailed tracking information and delivery estimates."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "Check my email for support"}, {"role": "assistant", "content": "I've sent a support ticket confirmation to your email address. You should receive it within 5 minutes."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What are your business hours?"}, {"role": "assistant", "content": "Our business hours are Monday-Friday 9AM-6PM EST. We're closed on weekends and major holidays."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "I need to cancel my subscription"}, {"role": "assistant", "content": "To cancel your subscription, go to Account Settings > Subscription > Cancel. Your access will continue until the end of your billing period."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I track my package?"}, {"role": "assistant", "content": "You can track your package using the tracking number in your confirmation email. Visit our website and enter the tracking number in the Track Order section."}], "metadata": {"score": 3, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What payment methods do you accept?"}, {"role": "assistant", "content": "We accept all major credit cards (Visa, MasterCard, American Express), PayPal, and Apple Pay. All transactions are secured with 256-bit encryption."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "I want to return an item"}, {"role": "assistant", "content": "Returns are accepted within 30 days of purchase. Please visit our Returns Center to start the process. Items must be in original condition."}], "metadata": {"score": 3, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I contact customer support?"}, {"role": "assistant", "content": "You can reach customer support via email at , phone at 1-800-123-4567, or live chat on our website. Response time is typically within 24 hours."}], "metadata": {"score": 3, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your refund policy?"}, {"role": "assistant", "content": "We offer full refunds within 30 days of purchase. After 30 days, store credit is available. Refunds process within 5-7 business days."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I update my billing information?"}, {"role": "assistant", "content": "Go to Account Settings > Billing > Update Payment Method. Enter your new payment details and save changes."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What shipping options are available?"}, {"role": "assistant", "content": "We offer Standard (5-7 days), Express (2-3 days), and Overnight shipping. Free shipping on orders over $50."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I change my password?"}, {"role": "assistant", "content": "Click Forgot Password on the login page. Enter your email and follow the link sent to you. Create a new password with at least 8 characters."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your privacy policy?"}, {"role": "assistant", "content": "We protect your data with industry-standard security. We never sell personal information. Full policy available at our website."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I delete my account?"}, {"role": "assistant", "content": "To delete your account, go to Account Settings > Delete Account. This action is permanent and cannot be undone. All data will be removed."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What features are included in premium?"}, {"role": "assistant", "content": "Premium includes unlimited storage, priority support, advanced analytics, and exclusive content. $9.99/month or $99/year."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I download my data?"}, {"role": "assistant", "content": "Go to Account Settings > Privacy > Download Data. We'll email you a link to download your complete data archive within 24 hours."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What browsers do you support?"}, {"role": "assistant", "content": "We support Chrome (latest), Firefox (latest), Safari (latest), and Edge (latest). Mobile browsers supported on iOS 12+ and Android 8+."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I report a bug?"}, {"role": "assistant", "content": "Report bugs via Help > Report Issue or email . Include steps to reproduce, browser info, and screenshots if possible."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your API rate limit?"}, {"role": "assistant", "content": "Free tier: 100 requests/hour. Pro tier: 1000 requests/hour. Enterprise: unlimited. Rate limits reset hourly."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I integrate with third-party apps?"}, {"role": "assistant", "content": "Visit Integrations in settings to connect with Slack, Google Workspace, Microsoft Teams, and 50+ other apps. OAuth authentication required."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What languages do you support?"}, {"role": "assistant", "content": "We support English, Spanish, French, German, Chinese, Japanese, and Korean. More languages coming soon."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I enable two-factor authentication?"}, {"role": "assistant", "content": "Go to Account Settings > Security > Enable 2FA. Choose between SMS or authenticator app. Backup codes provided for recovery."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your uptime guarantee?"}, {"role": "assistant", "content": "We guarantee 99.9% uptime for paid plans. Free tier has no guarantee. Status page available at status.example.com."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I upgrade my plan?"}, {"role": "assistant", "content": "Go to Account Settings > Subscription > Upgrade. Choose your new plan and enter payment details. Changes take effect immediately."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is included in the free trial?"}, {"role": "assistant", "content": "14-day free trial includes all premium features. No credit card required. Trial converts to paid plan unless cancelled."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I export my reports?"}, {"role": "assistant", "content": "Reports can be exported as PDF, Excel, or CSV. Go to Reports > Export > Choose format. Large reports emailed to you."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your cancellation policy?"}, {"role": "assistant", "content": "Cancel anytime. Access continues until end of billing period. No refunds for partial months. Annual plans can be cancelled with pro-rated refund."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I set up notifications?"}, {"role": "assistant", "content": "Go to Settings > Notifications. Choose email, SMS, or push notifications. Customize frequency and types of alerts."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What training resources are available?"}, {"role": "assistant", "content": "We offer video tutorials, documentation, webinars, and a knowledge base. Premium users get 1-on-1 training sessions."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I manage team members?"}, {"role": "assistant", "content": "Go to Team Settings > Invite Members. Send invitations via email. Set roles and permissions. Remove members as needed."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What security measures do you have?"}, {"role": "assistant", "content": "256-bit encryption, regular security audits, SOC 2 Type II compliance, GDPR compliance, and 24/7 security monitoring."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I customize my dashboard?"}, {"role": "assistant", "content": "Drag and drop widgets to rearrange. Click Edit to add/remove widgets. Save custom layouts for different views."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What mobile apps do you have?"}, {"role": "assistant", "content": "iOS and Android apps available. Features include full functionality, offline mode, and push notifications."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I schedule meetings?"}, {"role": "assistant", "content": "Use the calendar integration to schedule meetings. Send invites directly from the app. Sync with Google Calendar or Outlook."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your data retention policy?"}, {"role": "assistant", "content": "Free accounts: 90 days. Paid accounts: 1 year. Deleted accounts: 30 days before permanent removal. GDPR rights available."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I create templates?"}, {"role": "assistant", "content": "Go to Templates > Create New. Design your template, save it, and reuse for similar projects. Share with team members."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What analytics are available?"}, {"role": "assistant", "content": "Real-time dashboards, custom reports, trend analysis, user behavior tracking, and export capabilities for deeper analysis."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I backup my data?"}, {"role": "assistant", "content": "Automatic daily backups included. Manual backups available in Settings > Backup. Download backup files anytime."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What integrations are available?"}, {"role": "assistant", "content": "Slack, Google Workspace, Microsoft 365, Salesforce, Zapier, and 100+ more apps via our API marketplace."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I search my data?"}, {"role": "assistant", "content": "Use the search bar with filters for date, type, and keywords. Advanced search supports boolean operators and saved searches."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What customer support levels are there?"}, {"role": "assistant", "content": "Basic: email support (48h response). Pro: email + chat (24h). Enterprise: phone + dedicated account manager (1h)."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I set up automation?"}, {"role": "assistant", "content": "Go to Automation > Create Rule. Set triggers, conditions, and actions. Test rules before activating. Monitor execution logs."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What are the system requirements?"}, {"role": "assistant", "content": "Web: modern browser. Mobile: iOS 12+ or Android 8+. Desktop: Windows 10+, macOS 10.14+. 2GB RAM minimum."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I share files?"}, {"role": "assistant", "content": "Click Share on any file. Generate link or invite via email. Set permissions and expiration dates. Track access."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What collaboration features exist?"}, {"role": "assistant", "content": "Real-time editing, comments, version history, task assignment, and team workspaces with granular permissions."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I manage subscriptions?"}, {"role": "assistant", "content": "View all subscriptions in Account Settings. Pause, cancel, or modify anytime. Upgrade/downgrade with prorated billing."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your roadmap?"}, {"role": "assistant", "content": "Q1: AI features. Q2: mobile improvements. Q3: advanced analytics. Q4: enterprise features. Public roadmap available."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I provide feedback?"}, {"role": "assistant", "content": "Use the feedback button in-app, email , or join our community forum. We respond to all feedback."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What compliance standards do you follow?"}, {"role": "assistant", "content": "GDPR, CCPA, HIPAA, SOC 2 Type II, ISO 27001. Regular third-party audits and compliance reports available."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I import data?"}, {"role": "assistant", "content": "Support CSV, Excel, JSON formats. Go to Import > Select file > Map fields > Preview > Import. Large files processed in background."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your pricing model?"}, {"role": "assistant", "content": "Free: $0/month. Pro: $9.99/month. Team: $19.99/user/month. Enterprise: custom pricing. Annual billing saves 20%."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I manage permissions?"}, {"role": "assistant", "content": "Role-based access control. Create custom roles, set granular permissions, and assign to users. Audit trail available."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What monitoring tools are included?"}, {"role": "assistant", "content": "Performance metrics, error tracking, uptime monitoring, user analytics, and custom alerts via email or webhook."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I configure email settings?"}, {"role": "assistant", "content": "Go to Settings > Email. Configure SMTP, set signatures, create templates, and manage auto-responders. Test before saving."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your disaster recovery plan?"}, {"role": "assistant", "content": "Multiple data centers, automated failover, daily backups, 99.99% recovery time objective. Full recovery within 4 hours."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I use the API?"}, {"role": "assistant", "content": "RESTful API with comprehensive documentation. API keys in settings. Rate limits apply. SDKs available for Python, JS, Java."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What reporting features exist?"}, {"role": "assistant", "content": "Custom reports, scheduled reports, dashboards, data visualization, and export to multiple formats. Real-time updates."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I manage storage?"}, {"role": "assistant", "content": "View usage in Account Settings. Upgrade storage or archive old data. Automatic cleanup rules available. Compression enabled."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your SLA?"}, {"role": "assistant", "content": "99.9% uptime for paid plans, 99.5% for free. Support response: 1h enterprise, 24h pro, 48h free. Credit for downtime."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I set up SSO?"}, {"role": "assistant", "content": "Go to Security > SSO Configuration. Support SAML 2.0, OAuth 2.0, and LDAP. Test connection before enabling for all users."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What mobile features are available?"}, {"role": "assistant", "content": "Full app functionality, offline mode, push notifications, biometric authentication, and tablet-optimized interface."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I create workflows?"}, {"role": "assistant", "content": "Use the workflow builder to create multi-step processes. Add conditions, branches, and automation. Test before deployment."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your data processing policy?"}, {"role": "assistant", "content": "Data processed in secure data centers. No data sold to third parties. Anonymized data may be used for service improvement."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I manage user accounts?"}, {"role": "assistant", "content": "Bulk import via CSV, individual creation, or SSO sync. Set roles, permissions, and access levels. Deactivate unused accounts."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What customization options exist?"}, {"role": "assistant", "content": "Custom branding, themes, layouts, workflows, and fields. White-label options for enterprise plans."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I troubleshoot issues?"}, {"role": "assistant", "content": "Check status page, review error logs, clear cache, update browser/app. Contact support with error details and steps."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your deprecation policy?"}, {"role": "assistant", "content": "6 months notice for feature deprecation. 12 months for API changes. Migration guides and support provided."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I manage notifications?"}, {"role": "assistant", "content": "Configure email, SMS, push, in-app notifications. Set frequency, quiet hours, and priority levels. Custom rules available."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What testing tools are included?"}, {"role": "assistant", "content": "A/B testing, feature flags, beta testing, sandbox environment, and performance monitoring. Gradual rollout support."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I optimize performance?"}, {"role": "assistant", "content": "Enable caching, optimize queries, use CDN, compress images, and monitor metrics. Performance reports available weekly."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your incident response process?"}, {"role": "assistant", "content": "Immediate detection, assessment within 15min, resolution within 4h, post-incident review within 48h. Communication updates hourly."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I manage licenses?"}, {"role": "assistant", "content": "View license usage, assign/unassign seats, bulk purchase, and set auto-renewal. License pool optimization available."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your change management process?"}, {"role": "assistant", "content": "Bi-weekly releases, feature flags, gradual rollouts, rollback capability, and advance notifications for breaking changes."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I configure integrations?"}, {"role": "assistant", "content": "Browse integration marketplace, click Connect, authenticate via OAuth, configure settings, and test connection. Monitor status."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What documentation is available?"}, {"role": "assistant", "content": "API docs, user guides, video tutorials, FAQ, community forum, and developer blog. Updated weekly with new features."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I manage access tokens?"}, {"role": "assistant", "content": "Generate API tokens in settings, set expiration dates, scope permissions, and rotate regularly. Audit token usage logs."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your scalability approach?"}, {"role": "assistant", "content": "Auto-scaling infrastructure, load balancing, database sharding, CDN distribution, and geographic redundancy."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I handle large datasets?"}, {"role": "assistant", "content": "Chunked processing, streaming uploads, background jobs, progress tracking, and result notifications. Optimized for millions of records."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your code quality process?"}, {"role": "assistant", "content": "Code reviews, automated testing, CI/CD pipeline, static analysis, security scanning, and performance benchmarking."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I manage feature flags?"}, {"role": "assistant", "content": "Create flags in settings, define targeting rules, set rollout percentages, monitor metrics, and schedule automatic rollouts."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your user research process?"}, {"role": "assistant", "content": "User interviews, surveys, usability testing, analytics review, and feedback synthesis. Research shared with product team monthly."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I optimize costs?"}, {"role": "assistant", "content": "Monitor usage, set budgets, enable auto-scaling, archive old data, and review subscription tiers regularly."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your competitive advantage?"}, {"role": "assistant", "content": "Superior user experience, comprehensive features, excellent support, competitive pricing, and continuous innovation."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I manage dependencies?"}, {"role": "assistant", "content": "Track software dependencies, update regularly, security scan weekly, test compatibility, and maintain documentation."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your innovation strategy?"}, {"role": "assistant", "content": "User-driven development, rapid prototyping, A/B testing, data-driven decisions, and quarterly innovation sprints."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I handle edge cases?"}, {"role": "assistant", "content": "Comprehensive error handling, fallback mechanisms, user-friendly messages, logging for debugging, and graceful degradation."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your team structure?"}, {"role": "assistant", "content": "Cross-functional teams with product, engineering, design, and support. Agile methodology with 2-week sprints."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I ensure accessibility?"}, {"role": "assistant", "content": "WCAG 2.1 AA compliance, keyboard navigation, screen reader support, high contrast themes, and regular accessibility audits."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your learning platform?"}, {"role": "assistant", "content": "Interactive tutorials, video courses, certification program, webinars, and community knowledge sharing."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I manage beta programs?"}, {"role": "assistant", "content": "Invite-only beta testing, feedback collection, bug bounty program, early access features, and beta community forums."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your sustainability commitment?"}, {"role": "assistant", "content": "Carbon-neutral data centers, renewable energy, remote-first work, digital-first documentation, and green hosting providers."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I handle internationalization?"}, {"role": "assistant", "content": "Multi-language support, currency conversion, timezone handling, cultural adaptation, and regional compliance."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your partnership program?"}, {"role": "assistant", "content": "Technology partners, reseller program, integration marketplace, co-marketing opportunities, and revenue sharing models."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "How do I manage quality assurance?"}, {"role": "assistant", "content": "Automated testing, manual QA, performance testing, security testing, user acceptance testing, and continuous monitoring."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} +{"messages": [{"role": "user", "content": "What is your vision for the future?"}, {"role": "assistant", "content": "AI-powered automation, seamless integrations, predictive analytics, global expansion, and industry-specific solutions."}], "metadata": {"score": 2, "steps": 3, "inefficient": false, "optimization_suggestion": "Efficient", "quality_type": "good", "quality_reason": "good response"}} diff --git a/data-pipeline/data/raw_logs.json b/data-pipeline/data/raw_logs.json new file mode 100644 index 0000000..1a066cd --- /dev/null +++ b/data-pipeline/data/raw_logs.json @@ -0,0 +1,542 @@ +[ + { + "input": "What is the weather today? My email is john.doe@example.com", + "output": "The weather today is sunny with a high of 75°F. Perfect day for outdoor activities!", + "agent_turns": ["greeting", "weather_query", "response"] + }, + { + "input": "What is weather?", + "output": "error fetching data", + "agent_turns": ["greeting", "weather_query", "response", "error"] + }, + { + "input": "Hello", + "output": "ok", + "agent_turns": ["greeting", "response"] + }, + { + "input": "Get crop info", + "output": "", + "agent_turns": ["query", "empty_response"] + }, + { + "input": "Weather in Mumbai", + "output": "Rain expected", + "agent_turns": ["greeting", "location_query", "weather_fetch", "data_process", "response", "confirmation"] + }, + { + "input": "Help me reset my password. My phone is 1234567890", + "output": "Error: Unable to process password reset request.", + "agent_turns": ["authentication", "reset_request", "error"] + }, + { + "input": "How do I create a new account?", + "output": "To create a new account, click the Sign Up button and fill in your details.", + "agent_turns": ["welcome", "instructions"] + }, + { + "input": "Why is my payment failing? Contact me at user@test.com or 5556667777", + "output": "err", + "agent_turns": ["greeting", "problem_description", "troubleshooting"] + }, + { + "input": "Show me my order history", + "output": "Here are your recent orders: Order #1234 - Delivered, Order #1235 - In Transit, Order #1236 - Processing. Each order includes detailed tracking information and delivery estimates.", + "agent_turns": ["authentication", "order_lookup", "response"] + }, + { + "input": "What is the weather today?", + "output": "The weather today is sunny with a high of 75°F. Perfect day for outdoor activities!", + "agent_turns": ["greeting", "weather_query", "response"] + }, + { + "input": "How do I reset my password?", + "output": "Error: Unable to process password reset request.", + "agent_turns": ["authentication", "reset_request", "error"] + }, + { + "input": "Create account instructions", + "output": "To create a new account, click the Sign Up button and fill in your details.", + "agent_turns": ["welcome", "instructions"] + }, + { + "input": "Payment failure help", + "output": "err", + "agent_turns": ["greeting", "problem_description", "troubleshooting"] + }, + { + "input": "Order history request", + "output": "Here are your recent orders: Order #1234 - Delivered, Order #1235 - In Transit, Order #1236 - Processing. Each order includes detailed tracking information and delivery estimates.", + "agent_turns": ["authentication", "order_lookup", "response"] + }, + { + "input": "Check my email for support", + "output": "I've sent a support ticket confirmation to your email address. You should receive it within 5 minutes.", + "agent_turns": ["verification", "ticket_creation", "confirmation"] + }, + { + "input": "What are your business hours?", + "output": "Our business hours are Monday-Friday 9AM-6PM EST. We're closed on weekends and major holidays.", + "agent_turns": ["greeting", "information", "closing"] + }, + { + "input": "I need to cancel my subscription", + "output": "To cancel your subscription, go to Account Settings > Subscription > Cancel. Your access will continue until the end of your billing period.", + "agent_turns": ["authentication", "navigation", "instructions"] + }, + { + "input": "How do I track my package?", + "output": "You can track your package using the tracking number in your confirmation email. Visit our website and enter the tracking number in the Track Order section.", + "agent_turns": ["verification", "instructions", "resources"] + }, + { + "input": "What payment methods do you accept?", + "output": "We accept all major credit cards (Visa, MasterCard, American Express), PayPal, and Apple Pay. All transactions are secured with 256-bit encryption.", + "agent_turns": ["information", "security", "options"] + }, + { + "input": "I want to return an item", + "output": "Returns are accepted within 30 days of purchase. Please visit our Returns Center to start the process. Items must be in original condition.", + "agent_turns": ["policy", "process", "requirements"] + }, + { + "input": "How do I contact customer support?", + "output": "You can reach customer support via email at support@example.com, phone at 1-800-123-4567, or live chat on our website. Response time is typically within 24 hours.", + "agent_turns": ["options", "contact_info", "expectations"] + }, + { + "input": "What is your refund policy?", + "output": "We offer full refunds within 30 days of purchase. After 30 days, store credit is available. Refunds process within 5-7 business days.", + "agent_turns": ["policy", "timeframe", "process"] + }, + { + "input": "How do I update my billing information?", + "output": "Go to Account Settings > Billing > Update Payment Method. Enter your new payment details and save changes.", + "agent_turns": ["navigation", "instructions", "confirmation"] + }, + { + "input": "What shipping options are available?", + "output": "We offer Standard (5-7 days), Express (2-3 days), and Overnight shipping. Free shipping on orders over $50.", + "agent_turns": ["options", "pricing", "promotions"] + }, + { + "input": "How do I change my password?", + "output": "Click Forgot Password on the login page. Enter your email and follow the link sent to you. Create a new password with at least 8 characters.", + "agent_turns": ["initiation", "email_verification", "requirements"] + }, + { + "input": "What is your privacy policy?", + "output": "We protect your data with industry-standard security. We never sell personal information. Full policy available at our website.", + "agent_turns": ["security", "commitment", "resources"] + }, + { + "input": "How do I delete my account?", + "output": "To delete your account, go to Account Settings > Delete Account. This action is permanent and cannot be undone. All data will be removed.", + "agent_turns": ["warning", "process", "consequences"] + }, + { + "input": "What features are included in premium?", + "output": "Premium includes unlimited storage, priority support, advanced analytics, and exclusive content. $9.99/month or $99/year.", + "agent_turns": ["features", "pricing", "value_proposition"] + }, + { + "input": "How do I download my data?", + "output": "Go to Account Settings > Privacy > Download Data. We'll email you a link to download your complete data archive within 24 hours.", + "agent_turns": ["navigation", "process", "timeline"] + }, + { + "input": "What browsers do you support?", + "output": "We support Chrome (latest), Firefox (latest), Safari (latest), and Edge (latest). Mobile browsers supported on iOS 12+ and Android 8+.", + "agent_turns": ["compatibility", "requirements", "mobile"] + }, + { + "input": "How do I report a bug?", + "output": "Report bugs via Help > Report Issue or email bugs@example.com. Include steps to reproduce, browser info, and screenshots if possible.", + "agent_turns": ["channels", "requirements", "best_practices"] + }, + { + "input": "What is your API rate limit?", + "output": "Free tier: 100 requests/hour. Pro tier: 1000 requests/hour. Enterprise: unlimited. Rate limits reset hourly.", + "agent_turns": ["tiers", "limits", "timing"] + }, + { + "input": "How do I integrate with third-party apps?", + "output": "Visit Integrations in settings to connect with Slack, Google Workspace, Microsoft Teams, and 50+ other apps. OAuth authentication required.", + "agent_turns": ["navigation", "options", "authentication"] + }, + { + "input": "What languages do you support?", + "output": "We support English, Spanish, French, German, Chinese, Japanese, and Korean. More languages coming soon.", + "agent_turns": ["current_languages", "future_plans", "accessibility"] + }, + { + "input": "How do I enable two-factor authentication?", + "output": "Go to Account Settings > Security > Enable 2FA. Choose between SMS or authenticator app. Backup codes provided for recovery.", + "agent_turns": ["navigation", "options", "recovery"] + }, + { + "input": "What is your uptime guarantee?", + "output": "We guarantee 99.9% uptime for paid plans. Free tier has no guarantee. Status page available at status.example.com.", + "agent_turns": ["guarantee", "limitations", "monitoring"] + }, + { + "input": "How do I upgrade my plan?", + "output": "Go to Account Settings > Subscription > Upgrade. Choose your new plan and enter payment details. Changes take effect immediately.", + "agent_turns": ["navigation", "process", "timing"] + }, + { + "input": "What is included in the free trial?", + "output": "14-day free trial includes all premium features. No credit card required. Trial converts to paid plan unless cancelled.", + "agent_turns": ["duration", "features", "conversion"] + }, + { + "input": "How do I export my reports?", + "output": "Reports can be exported as PDF, Excel, or CSV. Go to Reports > Export > Choose format. Large reports emailed to you.", + "agent_turns": ["options", "process", "delivery"] + }, + { + "input": "What is your cancellation policy?", + "output": "Cancel anytime. Access continues until end of billing period. No refunds for partial months. Annual plans can be cancelled with pro-rated refund.", + "agent_turns": ["timing", "refund_policy", "exceptions"] + }, + { + "input": "How do I set up notifications?", + "output": "Go to Settings > Notifications. Choose email, SMS, or push notifications. Customize frequency and types of alerts.", + "agent_turns": ["navigation", "options", "customization"] + }, + { + "input": "What training resources are available?", + "output": "We offer video tutorials, documentation, webinars, and a knowledge base. Premium users get 1-on-1 training sessions.", + "agent_turns": ["resources", "formats", "premium_benefits"] + }, + { + "input": "How do I manage team members?", + "output": "Go to Team Settings > Invite Members. Send invitations via email. Set roles and permissions. Remove members as needed.", + "agent_turns": ["navigation", "invitation", "permissions"] + }, + { + "input": "What security measures do you have?", + "output": "256-bit encryption, regular security audits, SOC 2 Type II compliance, GDPR compliance, and 24/7 security monitoring.", + "agent_turns": ["encryption", "compliance", "monitoring"] + }, + { + "input": "How do I customize my dashboard?", + "output": "Drag and drop widgets to rearrange. Click Edit to add/remove widgets. Save custom layouts for different views.", + "agent_turns": ["interaction", "customization", "saving"] + }, + { + "input": "What mobile apps do you have?", + "output": "iOS and Android apps available. Features include full functionality, offline mode, and push notifications.", + "agent_turns": ["platforms", "features", "offline"] + }, + { + "input": "How do I schedule meetings?", + "output": "Use the calendar integration to schedule meetings. Send invites directly from the app. Sync with Google Calendar or Outlook.", + "agent_turns": ["integration", "invitations", "sync"] + }, + { + "input": "What is your data retention policy?", + "output": "Free accounts: 90 days. Paid accounts: 1 year. Deleted accounts: 30 days before permanent removal. GDPR rights available.", + "agent_turns": ["tiers", "timing", "compliance"] + }, + { + "input": "How do I create templates?", + "output": "Go to Templates > Create New. Design your template, save it, and reuse for similar projects. Share with team members.", + "agent_turns": ["creation", "saving", "sharing"] + }, + { + "input": "What analytics are available?", + "output": "Real-time dashboards, custom reports, trend analysis, user behavior tracking, and export capabilities for deeper analysis.", + "agent_turns": ["dashboards", "reports", "tracking"] + }, + { + "input": "How do I backup my data?", + "output": "Automatic daily backups included. Manual backups available in Settings > Backup. Download backup files anytime.", + "agent_turns": ["automatic", "manual", "access"] + }, + { + "input": "What integrations are available?", + "output": "Slack, Google Workspace, Microsoft 365, Salesforce, Zapier, and 100+ more apps via our API marketplace.", + "agent_turns": ["popular_apps", "api", "marketplace"] + }, + { + "input": "How do I search my data?", + "output": "Use the search bar with filters for date, type, and keywords. Advanced search supports boolean operators and saved searches.", + "agent_turns": ["basic_search", "filters", "advanced"] + }, + { + "input": "What customer support levels are there?", + "output": "Basic: email support (48h response). Pro: email + chat (24h). Enterprise: phone + dedicated account manager (1h).", + "agent_turns": ["levels", "response_times", "premium"] + }, + { + "input": "How do I set up automation?", + "output": "Go to Automation > Create Rule. Set triggers, conditions, and actions. Test rules before activating. Monitor execution logs.", + "agent_turns": ["creation", "testing", "monitoring"] + }, + { + "input": "What are the system requirements?", + "output": "Web: modern browser. Mobile: iOS 12+ or Android 8+. Desktop: Windows 10+, macOS 10.14+. 2GB RAM minimum.", + "agent_turns": ["web", "mobile", "desktop"] + }, + { + "input": "How do I share files?", + "output": "Click Share on any file. Generate link or invite via email. Set permissions and expiration dates. Track access.", + "agent_turns": ["sharing", "permissions", "tracking"] + }, + { + "input": "What collaboration features exist?", + "output": "Real-time editing, comments, version history, task assignment, and team workspaces with granular permissions.", + "agent_turns": ["editing", "communication", "organization"] + }, + { + "input": "How do I manage subscriptions?", + "output": "View all subscriptions in Account Settings. Pause, cancel, or modify anytime. Upgrade/downgrade with prorated billing.", + "agent_turns": ["management", "changes", "billing"] + }, + { + "input": "What is your roadmap?", + "output": "Q1: AI features. Q2: mobile improvements. Q3: advanced analytics. Q4: enterprise features. Public roadmap available.", + "agent_turns": ["timeline", "features", "transparency"] + }, + { + "input": "How do I provide feedback?", + "output": "Use the feedback button in-app, email feedback@example.com, or join our community forum. We respond to all feedback.", + "agent_turns": ["channels", "community", "response"] + }, + { + "input": "What compliance standards do you follow?", + "output": "GDPR, CCPA, HIPAA, SOC 2 Type II, ISO 27001. Regular third-party audits and compliance reports available.", + "agent_turns": ["standards", "audits", "documentation"] + }, + { + "input": "How do I import data?", + "output": "Support CSV, Excel, JSON formats. Go to Import > Select file > Map fields > Preview > Import. Large files processed in background.", + "agent_turns": ["formats", "process", "large_files"] + }, + { + "input": "What is your pricing model?", + "output": "Free: $0/month. Pro: $9.99/month. Team: $19.99/user/month. Enterprise: custom pricing. Annual billing saves 20%.", + "agent_turns": ["tiers", "pricing", "discounts"] + }, + { + "input": "How do I manage permissions?", + "output": "Role-based access control. Create custom roles, set granular permissions, and assign to users. Audit trail available.", + "agent_turns": ["roles", "granularity", "auditing"] + }, + { + "input": "What monitoring tools are included?", + "output": "Performance metrics, error tracking, uptime monitoring, user analytics, and custom alerts via email or webhook.", + "agent_turns": ["metrics", "tracking", "alerts"] + }, + { + "input": "How do I configure email settings?", + "output": "Go to Settings > Email. Configure SMTP, set signatures, create templates, and manage auto-responders. Test before saving.", + "agent_turns": ["configuration", "templates", "testing"] + }, + { + "input": "What is your disaster recovery plan?", + "output": "Multiple data centers, automated failover, daily backups, 99.99% recovery time objective. Full recovery within 4 hours.", + "agent_turns": ["infrastructure", "backups", "recovery_time"] + }, + { + "input": "How do I use the API?", + "output": "RESTful API with comprehensive documentation. API keys in settings. Rate limits apply. SDKs available for Python, JS, Java.", + "agent_turns": ["documentation", "authentication", "sdks"] + }, + { + "input": "What reporting features exist?", + "output": "Custom reports, scheduled reports, dashboards, data visualization, and export to multiple formats. Real-time updates.", + "agent_turns": ["customization", "scheduling", "visualization"] + }, + { + "input": "How do I manage storage?", + "output": "View usage in Account Settings. Upgrade storage or archive old data. Automatic cleanup rules available. Compression enabled.", + "agent_turns": ["monitoring", "upgrades", "automation"] + }, + { + "input": "What is your SLA?", + "output": "99.9% uptime for paid plans, 99.5% for free. Support response: 1h enterprise, 24h pro, 48h free. Credit for downtime.", + "agent_turns": ["uptime", "support", "compensation"] + }, + { + "input": "How do I set up SSO?", + "output": "Go to Security > SSO Configuration. Support SAML 2.0, OAuth 2.0, and LDAP. Test connection before enabling for all users.", + "agent_turns": ["protocols", "testing", "rollout"] + }, + { + "input": "What mobile features are available?", + "output": "Full app functionality, offline mode, push notifications, biometric authentication, and tablet-optimized interface.", + "agent_turns": ["functionality", "offline", "security"] + }, + { + "input": "How do I create workflows?", + "output": "Use the workflow builder to create multi-step processes. Add conditions, branches, and automation. Test before deployment.", + "agent_turns": ["builder", "logic", "testing"] + }, + { + "input": "What is your data processing policy?", + "output": "Data processed in secure data centers. No data sold to third parties. Anonymized data may be used for service improvement.", + "agent_turns": ["security", "privacy", "improvement"] + }, + { + "input": "How do I manage user accounts?", + "output": "Bulk import via CSV, individual creation, or SSO sync. Set roles, permissions, and access levels. Deactivate unused accounts.", + "agent_turns": ["creation", "permissions", "maintenance"] + }, + { + "input": "What customization options exist?", + "output": "Custom branding, themes, layouts, workflows, and fields. White-label options for enterprise plans.", + "agent_turns": ["branding", "themes", "enterprise"] + }, + { + "input": "How do I troubleshoot issues?", + "output": "Check status page, review error logs, clear cache, update browser/app. Contact support with error details and steps.", + "agent_turns": ["self_help", "diagnostics", "escalation"] + }, + { + "input": "What is your deprecation policy?", + "output": "6 months notice for feature deprecation. 12 months for API changes. Migration guides and support provided.", + "agent_turns": ["notice_period", "support", "migration"] + }, + { + "input": "How do I manage notifications?", + "output": "Configure email, SMS, push, in-app notifications. Set frequency, quiet hours, and priority levels. Custom rules available.", + "agent_turns": ["channels", "frequency", "prioritization"] + }, + { + "input": "What testing tools are included?", + "output": "A/B testing, feature flags, beta testing, sandbox environment, and performance monitoring. Gradual rollout support.", + "agent_turns": ["testing_types", "environments", "rollout"] + }, + { + "input": "How do I optimize performance?", + "output": "Enable caching, optimize queries, use CDN, compress images, and monitor metrics. Performance reports available weekly.", + "agent_turns": ["optimization", "monitoring", "reporting"] + }, + { + "input": "What is your incident response process?", + "output": "Immediate detection, assessment within 15min, resolution within 4h, post-incident review within 48h. Communication updates hourly.", + "agent_turns": ["detection", "resolution", "communication"] + }, + { + "input": "How do I manage licenses?", + "output": "View license usage, assign/unassign seats, bulk purchase, and set auto-renewal. License pool optimization available.", + "agent_turns": ["tracking", "assignment", "automation"] + }, + { + "input": "What is your change management process?", + "output": "Bi-weekly releases, feature flags, gradual rollouts, rollback capability, and advance notifications for breaking changes.", + "agent_turns": ["schedule", "safety", "communication"] + }, + { + "input": "How do I configure integrations?", + "output": "Browse integration marketplace, click Connect, authenticate via OAuth, configure settings, and test connection. Monitor status.", + "agent_turns": ["discovery", "setup", "monitoring"] + }, + { + "input": "What documentation is available?", + "output": "API docs, user guides, video tutorials, FAQ, community forum, and developer blog. Updated weekly with new features.", + "agent_turns": ["formats", "community", "updates"] + }, + { + "input": "How do I manage access tokens?", + "output": "Generate API tokens in settings, set expiration dates, scope permissions, and rotate regularly. Audit token usage logs.", + "agent_turns": ["generation", "security", "auditing"] + }, + { + "input": "What is your scalability approach?", + "output": "Auto-scaling infrastructure, load balancing, database sharding, CDN distribution, and geographic redundancy.", + "agent_turns": ["infrastructure", "performance", "reliability"] + }, + { + "input": "How do I handle large datasets?", + "output": "Chunked processing, streaming uploads, background jobs, progress tracking, and result notifications. Optimized for millions of records.", + "agent_turns": ["processing", "user_experience", "scale"] + }, + { + "input": "What is your code quality process?", + "output": "Code reviews, automated testing, CI/CD pipeline, static analysis, security scanning, and performance benchmarking.", + "agent_turns": ["reviews", "testing", "automation"] + }, + { + "input": "How do I manage feature flags?", + "output": "Create flags in settings, define targeting rules, set rollout percentages, monitor metrics, and schedule automatic rollouts.", + "agent_turns": ["creation", "targeting", "monitoring"] + }, + { + "input": "What is your user research process?", + "output": "User interviews, surveys, usability testing, analytics review, and feedback synthesis. Research shared with product team monthly.", + "agent_turns": ["methods", "frequency", "sharing"] + }, + { + "input": "How do I optimize costs?", + "output": "Monitor usage, set budgets, enable auto-scaling, archive old data, and review subscription tiers regularly.", + "agent_turns": ["monitoring", "automation", "optimization"] + }, + { + "input": "What is your competitive advantage?", + "output": "Superior user experience, comprehensive features, excellent support, competitive pricing, and continuous innovation.", + "agent_turns": ["product", "service", "value"] + }, + { + "input": "How do I manage dependencies?", + "output": "Track software dependencies, update regularly, security scan weekly, test compatibility, and maintain documentation.", + "agent_turns": ["tracking", "updates", "security"] + }, + { + "input": "What is your innovation strategy?", + "output": "User-driven development, rapid prototyping, A/B testing, data-driven decisions, and quarterly innovation sprints.", + "agent_turns": ["process", "testing", "timing"] + }, + { + "input": "How do I handle edge cases?", + "output": "Comprehensive error handling, fallback mechanisms, user-friendly messages, logging for debugging, and graceful degradation.", + "agent_turns": ["prevention", "recovery", "debugging"] + }, + { + "input": "What is your team structure?", + "output": "Cross-functional teams with product, engineering, design, and support. Agile methodology with 2-week sprints.", + "agent_turns": ["organization", "methodology", "timing"] + }, + { + "input": "How do I ensure accessibility?", + "output": "WCAG 2.1 AA compliance, keyboard navigation, screen reader support, high contrast themes, and regular accessibility audits.", + "agent_turns": ["standards", "features", "auditing"] + }, + { + "input": "What is your learning platform?", + "output": "Interactive tutorials, video courses, certification program, webinars, and community knowledge sharing.", + "agent_turns": ["content", "certification", "community"] + }, + { + "input": "How do I manage beta programs?", + "output": "Invite-only beta testing, feedback collection, bug bounty program, early access features, and beta community forums.", + "agent_turns": ["selection", "feedback", "community"] + }, + { + "input": "What is your sustainability commitment?", + "output": "Carbon-neutral data centers, renewable energy, remote-first work, digital-first documentation, and green hosting providers.", + "agent_turns": ["infrastructure", "work_policy", "environment"] + }, + { + "input": "How do I handle internationalization?", + "output": "Multi-language support, currency conversion, timezone handling, cultural adaptation, and regional compliance.", + "agent_turns": ["languages", "localization", "compliance"] + }, + { + "input": "What is your partnership program?", + "output": "Technology partners, reseller program, integration marketplace, co-marketing opportunities, and revenue sharing models.", + "agent_turns": ["types", "benefits", "compensation"] + }, + { + "input": "How do I manage quality assurance?", + "output": "Automated testing, manual QA, performance testing, security testing, user acceptance testing, and continuous monitoring.", + "agent_turns": ["testing", "performance", "security"] + }, + { + "input": "What is your vision for the future?", + "output": "AI-powered automation, seamless integrations, predictive analytics, global expansion, and industry-specific solutions.", + "agent_turns": ["technology", "growth", "specialization"] + } +] \ No newline at end of file diff --git a/data-pipeline/main.py b/data-pipeline/main.py new file mode 100644 index 0000000..8aaacbb --- /dev/null +++ b/data-pipeline/main.py @@ -0,0 +1,71 @@ +import os +import sys + +# Add src to path for imports +sys.path.append(os.path.join(os.path.dirname(__file__), 'src')) + +from src.ingest import load_logs +from src.process import process_logs +from src.score import score_entry +from src.improve import add_optimization_hint +from src.export import export_jsonl +from src.select import select_diverse + +def main(): + # 1. Load logs + raw_logs = load_logs('data/raw_logs.json') + + # 2. Clean PII and process logs + processed_logs = process_logs(raw_logs) + + # 3. Add optimization hints + processed_logs = [add_optimization_hint(log) for log in processed_logs] + + # 4. Score entries + scored_logs = [] + for log in processed_logs: + score = score_entry(log) + log['score'] = score + scored_logs.append(log) + + # 5. Separate entries by quality type + good_entries = [log for log in scored_logs if log['quality_type'] == 'good'] + bad_entries = [log for log in scored_logs if log['quality_type'] in ['weak', 'bad']] + + # 6. Logging + inefficient_count = sum(1 for log in scored_logs if log.get('inefficient', False)) + weak_count = sum(1 for log in scored_logs if log.get('quality_type') == 'weak') + good_quality_count = sum(1 for log in scored_logs if log.get('quality_type') == 'good') + bad_quality_count = sum(1 for log in scored_logs if log.get('quality_type') == 'bad') + + print(f"Total logs: {len(scored_logs)}") + + # 7. Apply diversity selection + good_entries = select_diverse(good_entries) + print(f"After diversity filter: {len(good_entries)} entries") + + # 8. Export results + export_jsonl(good_entries, 'data/output.jsonl') + + print(f"\nAfter filtering:") + print(f"- Kept for training (high-quality): {len(good_entries)}") + print(f"- Removed:") + print(f" - Weak: {weak_count}") + print(f" - Bad: {bad_quality_count}") + print(f"\nInefficient entries: {inefficient_count}") + + # Show unique quality reasons for removed entries + reason_counts = {} + for log in scored_logs: + if log.get('quality_type') in ['bad', 'weak']: + reason = log.get('quality_reason', 'no reason') + reason_counts[reason] = reason_counts.get(reason, 0) + 1 + + print("\nRemoved entries summary:") + for reason, count in reason_counts.items(): + print(f" - {reason} ({count})") + + print(f"\nExported to data/output.jsonl") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/data-pipeline/src/clean.py b/data-pipeline/src/clean.py new file mode 100644 index 0000000..2fb1818 --- /dev/null +++ b/data-pipeline/src/clean.py @@ -0,0 +1,13 @@ +import re + +def remove_pii(text): + """Remove PII from text with government ID support.""" + # Replace 10 digit numbers with + text = re.sub(r'\b\d{10}\b', '', text) + # Replace emails with + text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text) + # Replace government IDs (Aadhaar format) + text = re.sub(r'\b\d{4}\s?\d{4}\s?\d{4}\b', '', text) + # Replace PAN card format + text = re.sub(r'\b[A-Z]{5}\d{4}[A-Z]\b', '', text) + return text \ No newline at end of file diff --git a/data-pipeline/src/export.py b/data-pipeline/src/export.py new file mode 100644 index 0000000..7c6cc9a --- /dev/null +++ b/data-pipeline/src/export.py @@ -0,0 +1,27 @@ +import json + +def format_chat_template(entry): + """Format entry for chat template (LoRA-ready).""" + messages = [ + {"role": "user", "content": entry['input']}, + {"role": "assistant", "content": entry['output']} + ] + + return { + "messages": messages, + "metadata": { + "score": entry.get('score', 0), + "steps": entry.get('steps', 0), + "inefficient": entry.get('inefficient', False), + "optimization_suggestion": entry.get('optimization_suggestion', 'Unknown'), + "quality_type": entry.get('quality_type', 'unknown'), + "quality_reason": entry.get('quality_reason', 'no reason') + } + } + +def export_jsonl(data, path): + """Export data to JSONL file with chat template.""" + with open(path, 'w', encoding='utf-8') as f: + for item in data: + formatted = format_chat_template(item) + f.write(json.dumps(formatted) + '\n') \ No newline at end of file diff --git a/data-pipeline/src/improve.py b/data-pipeline/src/improve.py new file mode 100644 index 0000000..eb6321e --- /dev/null +++ b/data-pipeline/src/improve.py @@ -0,0 +1,8 @@ +def add_optimization_hint(entry): + """Add optimization suggestion based on efficiency.""" + if entry.get('inefficient'): + entry['optimization_suggestion'] = "Reduce unnecessary steps or tool calls" + else: + entry['optimization_suggestion'] = "Efficient" + + return entry diff --git a/data-pipeline/src/ingest.py b/data-pipeline/src/ingest.py new file mode 100644 index 0000000..56526df --- /dev/null +++ b/data-pipeline/src/ingest.py @@ -0,0 +1,6 @@ +import json + +def load_logs(path): + """Load logs from JSON file.""" + with open(path, 'r', encoding='utf-8') as f: + return json.load(f) \ No newline at end of file diff --git a/data-pipeline/src/process.py b/data-pipeline/src/process.py new file mode 100644 index 0000000..1933a09 --- /dev/null +++ b/data-pipeline/src/process.py @@ -0,0 +1,28 @@ +from .clean import remove_pii + +def detect_inefficiency(entry): + """Detect if entry is inefficient based on steps.""" + steps = entry.get("steps", 0) + return steps > 3 + +def process_logs(logs): + """Process logs into structured format.""" + processed = [] + for log in logs: + # Clean input and output + input_text = remove_pii(log.get('input', '')) + output_text = remove_pii(log.get('output', '')) + + # Calculate steps + steps = len(log.get('agent_turns', [])) if 'agent_turns' in log else 1 + + # Detect inefficiency + inefficient = detect_inefficiency({'steps': steps}) + + processed.append({ + 'input': input_text, + 'output': output_text, + 'steps': steps, + 'inefficient': inefficient + }) + return processed \ No newline at end of file diff --git a/data-pipeline/src/score.py b/data-pipeline/src/score.py new file mode 100644 index 0000000..5ca91a3 --- /dev/null +++ b/data-pipeline/src/score.py @@ -0,0 +1,85 @@ +def is_bad(entry): + """Hard filters for bad entries.""" + output = entry["output"].strip().lower() + + # Hard rules - always bad + if output == "": + entry['quality_reason'] = 'empty response' + return True + if output in ["ok", "yes", "done"]: + entry['quality_reason'] = 'minimal response' + return True + if output in ["err", "error", "failed"]: + entry['quality_reason'] = 'error only' + return True + + # Specific failure patterns (not just mentioning "error") + bad_patterns = ["error fetching", "unable to", "failed to", "exception"] + for pattern in bad_patterns: + if pattern in output: + entry['quality_reason'] = f'failure: {pattern}' + return True + + return False + +def is_weak(entry): + """Detect weak/low-quality answers.""" + output = entry["output"].strip().lower() + + # Too vague - too short + if len(output) < 20 and output not in ["err", "error", "failed"]: + entry['quality_reason'] = 'too brief' + return True + + # Generic phrases that indicate uncertainty or vagueness + weak_phrases = ["it might", "it could", "maybe", "not sure", "possibly", "probably"] + if any(p in output for p in weak_phrases): + entry['quality_reason'] = 'uncertain language' + return True + + return False + +def score_entry(entry): + """Score an entry with hard filters and weak detection.""" + # Apply hard filters first + if is_bad(entry): + entry['quality_type'] = 'bad' + return -1 # Force bad + + score = 0 + + # Check for weak answers + if is_weak(entry): + score -= 1 + entry['quality_type'] = 'weak' + else: + entry['quality_type'] = 'good' + entry['quality_reason'] = 'good response' + + # Base quality metrics + if len(entry.get('output', '')) > 20: + score += 1 + + if entry.get('steps', 0) <= 3: + score += 1 + + # Persona compliance (helpful, professional, clear) + output = entry.get('output', '').lower() + persona_keywords = ['please', 'thank you', 'here is', 'you can', 'let me help'] + if any(keyword in output for keyword in persona_keywords): + score += 1 + + # Trajectory efficiency bonus + trajectory = entry.get('trajectory_complexity', {}) + if not trajectory.get('has_errors', True): + score += 1 + + # Tool consistency check + if entry.get('tool_consistency', False): + score += 1 + + # Inefficiency penalty + if entry.get('inefficient', False): + score -= 1 + + return score \ No newline at end of file diff --git a/data-pipeline/src/select.py b/data-pipeline/src/select.py new file mode 100644 index 0000000..6a0b28e --- /dev/null +++ b/data-pipeline/src/select.py @@ -0,0 +1,29 @@ +def select_diverse(entries): + """Remove duplicate entries, keep first occurrence.""" + seen_inputs = set() + diverse_entries = [] + + for entry in entries: + # Better normalization for diversity filtering + input_text = entry.get('input', '').strip().lower().replace('?', '') + if input_text not in seen_inputs: + seen_inputs.add(input_text) + diverse_entries.append(entry) + + return diverse_entries + +def balance_by_steps(entries): + simple = [e for e in entries if e.get('steps', 0) <= 2] + complex = [e for e in entries if e.get('steps', 0) > 2] + + # Take equal amounts from both groups + min_count = min(len(simple), len(complex)) + balanced = simple[:min_count] + complex[:min_count] + + # Add remaining entries if one group is larger + if len(simple) > len(complex): + balanced.extend(simple[min_count:min_count + (len(simple) - len(complex))]) + elif len(complex) > len(simple): + balanced.extend(complex[min_count:min_count + (len(complex) - len(simple))]) + + return balanced