diff --git a/.cursor/debug/CLI-GUIDE.md b/.cursor/debug/CLI-GUIDE.md deleted file mode 100644 index 4f36ef5..0000000 --- a/.cursor/debug/CLI-GUIDE.md +++ /dev/null @@ -1,364 +0,0 @@ -# Pickbox CLI Guide - -This guide covers the new Cobra-based CLI for Pickbox, a distributed file storage system. - -## Installation - -### Build from Source -```bash -# Clone the repository -git clone https://github.com/addityasingh/pickbox -cd pickbox - -# Build the CLI -make build - -# Install to PATH (optional) -make install -``` - -### Quick Start -```bash -# Show help -./bin/pickbox --help - -# Start a single node cluster -./bin/pickbox node start --node-id node1 --port 8001 --bootstrap - -# Start a 3-node cluster quickly -make demo-3-nodes -``` - -## Command Structure - -The CLI is organized into logical command groups: - -``` -pickbox -├── node # Node management -│ ├── start # Start full-featured node -│ └── multi # Start multi-directional replication node -├── cluster # Cluster management -│ ├── join # Join node to cluster -│ └── status # Check cluster status -└── script # Common operations - ├── demo-3-nodes # Demo 3-node cluster - └── cleanup # Clean up data -``` - -## Commands - -### Node Commands - -#### Start Full-Featured Node -```bash -pickbox node start [flags] -``` - -**Flags:** -- `--node-id, -n`: Node ID (required) -- `--port, -p`: Raft port (default: 8001) -- `--admin-port`: Admin API port (default: 9001) -- `--monitor-port`: Monitor port (default: 9002) -- `--dashboard-port`: Dashboard port (default: 9003) -- `--join, -j`: Address of node to join -- `--bootstrap, -b`: Bootstrap new cluster -- `--data-dir, -d`: Data directory (default: "data") -- `--log-level, -l`: Log level (default: "info") - -**Examples:** -```bash -# Bootstrap a new cluster -pickbox node start --node-id node1 --port 8001 --bootstrap - -# Join an existing cluster -pickbox node start --node-id node2 --port 8002 --join 127.0.0.1:8001 - -# Custom ports and directories -pickbox node start --node-id 
node3 --port 8003 --admin-port 9010 --data-dir /tmp/pickbox -``` - -#### Start Multi-Directional Replication Node -```bash -pickbox node multi [flags] -``` - -**Flags:** -- `--node-id, -n`: Node ID (required) -- `--port, -p`: Port (default: 8001) -- `--join, -j`: Address of node to join - -**Features:** -- Multi-directional file replication (edit files on any node!) -- Real-time file watching and replication -- Automatic leader forwarding for non-leader nodes -- Raft consensus for consistency - -**Examples:** -```bash -# Start multi-directional replication node -pickbox node multi --node-id multi1 --port 8010 - -# Join existing multi-directional cluster -pickbox node multi --node-id multi2 --port 8011 --join 127.0.0.1:8010 -``` - -### Cluster Commands - -#### Join Node to Cluster -```bash -pickbox cluster join [flags] -``` - -**Flags:** -- `--leader, -l`: Leader address (required) -- `--node-id, -n`: Node ID to join (required) -- `--node-addr, -a`: Node address (required) - -**Examples:** -```bash -# Join node to cluster -pickbox cluster join --leader 127.0.0.1:8001 --node-id node4 --node-addr 127.0.0.1:8004 -``` - -#### Check Cluster Status -```bash -pickbox cluster status [flags] -``` - -**Flags:** -- `--addr, -a`: Admin address to check (default: "127.0.0.1:9001") - -**Examples:** -```bash -# Check default cluster status -pickbox cluster status - -# Check specific admin server -pickbox cluster status --addr 127.0.0.1:9002 -``` - -### Script Commands - -#### Demo 3-Node Cluster -```bash -pickbox script demo-3-nodes -``` - -Automatically: -- Cleans up old data -- Starts node1 as bootstrap -- Starts node2 and node3 joining the cluster -- Shows access URLs and data directories - -#### Cleanup Data -```bash -pickbox script cleanup -``` - -Removes all data directories from previous runs. - -## Common Use Cases - -### 1. 
Quick Testing (3-Node Cluster) -```bash -# Start demo cluster -make demo-3-nodes - -# Or manually -pickbox script demo-3-nodes - -# Access URLs will be shown: -# - Admin APIs: http://localhost:9001, 9002, 9003 -# - Dashboards: http://localhost:9003, 9006, 9009 -# - Data dirs: data/node1, data/node2, data/node3 -``` - -### 2. Manual Cluster Setup -```bash -# Terminal 1: Start bootstrap node -pickbox node start --node-id node1 --port 8001 --bootstrap - -# Terminal 2: Start second node -pickbox node start --node-id node2 --port 8002 --join 127.0.0.1:8001 - -# Terminal 3: Start third node -pickbox node start --node-id node3 --port 8003 --join 127.0.0.1:8001 -``` - -### 3. Multi-Directional Replication Testing -```bash -# Terminal 1: Start multi-directional node -pickbox node multi --node-id multi1 --port 8010 - -# Terminal 2: Join another multi-directional node -pickbox node multi --node-id multi2 --port 8011 --join 127.0.0.1:8010 - -# Multi-directional: Edit files in data/multi1/ OR data/multi2/ and watch them replicate to all nodes! -# Files can be edited on any node and will automatically replicate to all others -``` - -### 4. 
Dynamic Cluster Management -```bash -# Add a new node to running cluster -pickbox cluster join --leader 127.0.0.1:8001 --node-id node4 --node-addr 127.0.0.1:8004 - -# Check cluster status -pickbox cluster status --addr 127.0.0.1:9001 -``` - -## Port Allocation - -The CLI uses predictable port allocation: - -**Full Node (node start):** -- Raft port: Specified by `--port` (default: 8001) -- Admin port: Specified by `--admin-port` (default: 9001) -- Monitor port: Specified by `--monitor-port` (default: 9002) -- Dashboard port: Specified by `--dashboard-port` (default: 9003) - -**Multi-Directional Node (node multi):** -- Raft port: Specified by `--port` (default: 8001) -- Admin port: Raft port + 1000 (e.g., 8001 → 9001) - -**Demo 3-Node Cluster:** -- Node1: Raft 8001, Admin 9001, Monitor 9002, Dashboard 9003 -- Node2: Raft 8002, Admin 9002, Monitor 9003, Dashboard 9006 -- Node3: Raft 8003, Admin 9003, Monitor 9004, Dashboard 9009 - -## Data Directories - -By default, data is stored in: -- `data/node1/` for node1 -- `data/node2/` for node2 -- etc. - -Each node's data directory contains: -- File storage -- Raft logs -- Snapshots -- Configuration - -## Monitoring and Admin - -### Admin API -Access admin APIs at `http://localhost:9001` (or specified admin port). - -### Monitoring -Access monitoring at `http://localhost:9002/metrics` (or specified monitor port). - -### Dashboard -Access dashboard at `http://localhost:9003` (or specified dashboard port). 
- -## File Operations - -### Full Node -- Real-time file watching and replication -- Admin interface for cluster management -- Monitoring and metrics -- Dashboard UI - -### Multi-Directional Node -- Multi-directional file watching and replication -- Real-time file sync across all nodes -- Automatic leader forwarding for consistency -- Basic admin interface -- Optimized for performance - -## Cleanup - -```bash -# Stop all nodes -pkill pickbox - -# Clean up data -pickbox script cleanup - -# Or using make -make demo-cleanup -``` - -## Troubleshooting - -### Common Issues - -1. **Port conflicts**: Use different ports with `--port`, `--admin-port`, etc. -2. **Data directory conflicts**: Use `--data-dir` to specify different directories -3. **Join failures**: Ensure the leader node is running and accessible - -### Debug Mode -```bash -# Enable debug logging -pickbox node start --node-id node1 --log-level debug --bootstrap -``` - -### Check Logs -All nodes log to stdout with structured logging. 
Look for: -- `🚀` - Node startup -- `👑` - Leadership changes -- `📡` - File replication -- `✅` - Success messages -- `❌` - Errors - -## Migration from Old Commands - -### Old vs New Commands - -| Old Command | New Command | -|-------------|-------------| -| `./bin/multi_replication` | `pickbox node start` | -| `./bin/live_replication` | `pickbox node multi` | -| Custom scripts | `pickbox script demo-3-nodes` | - -### Example Migration -```bash -# Old way -./bin/multi_replication -node node1 -port 8001 -bootstrap - -# New way -pickbox node start --node-id node1 --port 8001 --bootstrap -``` - -## Advanced Usage - -### Custom Configuration -```bash -# Production-like setup -pickbox node start \ - --node-id prod-node1 \ - --port 8001 \ - --admin-port 9001 \ - --monitor-port 9002 \ - --dashboard-port 9003 \ - --data-dir /opt/pickbox/data \ - --log-level info \ - --bootstrap -``` - -### Scripted Deployment -```bash -#!/bin/bash -# Deploy 5-node cluster -for i in {1..5}; do - port=$((8000 + i)) - admin_port=$((9000 + i)) - - if [ $i -eq 1 ]; then - pickbox node start --node-id node$i --port $port --admin-port $admin_port --bootstrap & - else - pickbox node start --node-id node$i --port $port --admin-port $admin_port --join 127.0.0.1:8001 & - fi - - sleep 2 -done -``` - -## Next Steps - -1. Try the quick start: `make demo-3-nodes` -2. Explore the admin interface: `http://localhost:9001` -3. Check the monitoring dashboard: `http://localhost:9003` -4. Test file replication by editing files in `data/node1/` -5. Scale up by adding more nodes with `pickbox cluster join` - -For more information, see the main README and package documentation. 
\ No newline at end of file diff --git a/.cursor/debug/MULTI-DIRECTIONAL-UPGRADE.md b/.cursor/debug/MULTI-DIRECTIONAL-UPGRADE.md deleted file mode 100644 index 2f258a6..0000000 --- a/.cursor/debug/MULTI-DIRECTIONAL-UPGRADE.md +++ /dev/null @@ -1,196 +0,0 @@ -# Multi-Directional Live Replication Upgrade - -## Summary - -The live replication functionality in the Pickbox CLI has been upgraded to support **multi-directional replication**, making it consistent with the `multi_replication` implementation. This upgrade enables file changes to be detected and replicated from any node in the cluster, not just the leader. - -## Key Changes - -### 🔄 Multi-Directional Replication -- **Before**: Only the leader node could detect file changes and replicate them -- **After**: Any node can detect file changes and they'll be replicated to all other nodes -- **Implementation**: Uses the same modular components as `multi_replication` - -### 🏗️ Architecture Upgrade - -#### Old Implementation (Leader-Only) -```go -// Custom FSM with basic file watching -type LiveFSM struct { - dataDir string - watcher *fsnotify.Watcher - raft *raft.Raft - isLeader bool -} - -// Only watched files when node was leader -if r.State() == raft.Leader { - // Process file changes -} -``` - -#### New Implementation (Multi-Directional) -```go -// Modular components like multi_replication -type LiveApplication struct { - config LiveConfig - logger *logrus.Logger - raftManager *storage.RaftManager - stateManager *watcher.DefaultStateManager - fileWatcher *watcher.FileWatcher - adminServer *admin.Server -} - -// File watcher works on all nodes with leader forwarding -app.fileWatcher, err = watcher.NewFileWatcher( - watcherConfig, - &liveRaftWrapper{app.raftManager}, - app.stateManager, - &liveForwarderWrapper{app.logger}, -) -``` - -### 🔧 Technical Improvements - -1. 
**Modular Components**: Now uses the same proven components as `multi_replication`: - - `storage.RaftManager` for Raft operations - - `watcher.FileWatcher` for multi-directional file watching - - `watcher.DefaultStateManager` for state management - - `admin.Server` for cluster management - -2. **Leader Forwarding**: Non-leader nodes can detect file changes and forward them to the leader: - ```go - type liveForwarderWrapper struct { - logger *logrus.Logger - } - - func (fw *liveForwarderWrapper) ForwardToLeader(leaderAddr string, cmd watcher.Command) error { - // Convert and forward command to leader - return admin.ForwardToLeader(adminAddr, adminCmd) - } - ``` - -3. **Enhanced Monitoring**: Better logging and leadership monitoring: - ```go - if isLeader && !wasLeader { - app.logger.Infof("👑 %s became leader - multi-directional replication active", app.config.NodeID) - } else if !isLeader && wasLeader { - app.logger.Infof("👥 %s is now a follower - forwarding changes to leader", app.config.NodeID) - } - ``` - -## User Benefits - -### 🎯 Improved User Experience -- **Edit files anywhere**: Users can edit files on any node and see them replicate to all others -- **No leader dependency**: File changes work regardless of which node the user is on -- **Consistent behavior**: Live replication now works the same as full node replication - -### 📊 Enhanced Functionality -- **Real-time sync**: Files are immediately synchronized across all nodes -- **Automatic failover**: If the leader changes, replication continues seamlessly -- **Better debugging**: Enhanced logging shows replication status and leader changes - -## Usage Examples - -### Before (Leader-Only) -```bash -# Start nodes -pickbox node live --node-id live1 --port 8010 # Leader -pickbox node live --node-id live2 --port 8011 --join 127.0.0.1:8010 # Follower - -# Only editing files in data/live1/ would replicate to data/live2/ -# Editing files in data/live2/ would NOT replicate anywhere -``` - -### After 
(Multi-Directional) -```bash -# Start nodes -pickbox node live --node-id live1 --port 8010 # Leader -pickbox node live --node-id live2 --port 8011 --join 127.0.0.1:8010 # Follower - -# Editing files in data/live1/ replicates to data/live2/ ✅ -# Editing files in data/live2/ replicates to data/live1/ ✅ -# All file changes are synchronized across all nodes! 🎉 -``` - -## Technical Details - -### File Change Detection Flow -1. **File Change**: User edits a file on any node -2. **Detection**: `watcher.FileWatcher` detects the change -3. **Leadership Check**: - - If leader: Apply directly through Raft - - If follower: Forward to leader via `liveForwarderWrapper` -4. **Replication**: Leader applies change and replicates to all followers -5. **Consistency**: All nodes have the same file content - -### Components Integration -```go -// Raft operations -liveRaftWrapper -> storage.RaftManager -> raft.Raft - -// File watching -watcher.FileWatcher -> liveRaftWrapper (leader) or liveForwarderWrapper (follower) - -// Admin operations -admin.Server -> admin.RequestJoinCluster -> admin.ForwardToLeader -``` - -## Testing - -### Multi-Directional Test -```bash -# Terminal 1: Start bootstrap node -pickbox node live --node-id live1 --port 8010 - -# Terminal 2: Join second node -pickbox node live --node-id live2 --port 8011 --join 127.0.0.1:8010 - -# Terminal 3: Test multi-directional replication -echo "From node1" > data/live1/test.txt -echo "From node2" > data/live2/test2.txt - -# Both files should appear in both data directories! 
-ls data/live1/ # Should show: test.txt, test2.txt -ls data/live2/ # Should show: test.txt, test2.txt -``` - -### Performance Test -```bash -# Create multiple files on different nodes simultaneously -for i in {1..10}; do - echo "File $i from node1" > data/live1/file$i.txt & - echo "File $i from node2" > data/live2/file$i.txt & -done - -# All files should replicate to all nodes consistently -``` - -## Backwards Compatibility - -- **CLI Interface**: No changes to command-line interface -- **Configuration**: Same flags and options -- **Data Format**: Compatible with existing data directories -- **Migration**: Existing clusters can be upgraded without data loss - -## Future Enhancements - -1. **Conflict Resolution**: Handle simultaneous edits to the same file -2. **File Locking**: Prevent concurrent modifications -3. **Incremental Sync**: Only sync changed portions of files -4. **Compression**: Compress data during replication -5. **Metrics**: Add replication performance metrics - -## Conclusion - -The multi-directional live replication upgrade brings the live replication functionality in line with the full-featured multi-replication implementation. Users can now edit files on any node and have them automatically replicate to all other nodes in the cluster, providing a seamless distributed file system experience. - -### Key Benefits: -- ✅ Multi-directional file replication -- ✅ Automatic leader forwarding -- ✅ Enhanced monitoring and logging -- ✅ Consistent with multi_replication behavior -- ✅ No breaking changes to CLI interface - -This upgrade makes the Pickbox distributed file storage system more intuitive and powerful for distributed development and deployment scenarios. 
\ No newline at end of file diff --git a/.cursor/debug/architecture_evolution_overview.md b/.cursor/debug/architecture_evolution_overview.md index 8500bff..fb8c0a8 100644 --- a/.cursor/debug/architecture_evolution_overview.md +++ b/.cursor/debug/architecture_evolution_overview.md @@ -1,130 +1,298 @@ -# Pickbox Architecture Evolution Overview +# Architecture Evolution Overview: Pickbox Distributed Storage System -This document traces the evolution of the Pickbox distributed storage system through its development phases, showing how the architecture has grown from basic replication to a sophisticated multi-directional file synchronization system. +## Executive Summary -## Evolution Summary +This document provides a comprehensive overview of the architectural evolution of the Pickbox distributed storage system through three distinct phases, each building upon the previous to create a production-ready, multi-directional file replication system with strong consistency guarantees. -### Current Architecture (Unified CLI) -- **Implementation**: `cmd/pickbox/` - Unified CLI with multiple node modes -- **Key Features**: - - Single binary with subcommands (`pickbox node start`, `pickbox node multi`, `pickbox cluster`) - - Multi-directional replication with real-time file watching - - Comprehensive monitoring and dashboard - - Admin interface with forwarding capabilities - - Enhanced testing and documentation +## Evolution Overview Diagram -### Legacy Evolution Path -The current unified implementation evolved from multiple standalone applications: +```mermaid +graph TB + subgraph "Architecture Evolution: From Basic Consensus to Multi-Directional Replication" + + subgraph "Step 1: Basic Raft Consensus Foundation" + S1_TITLE[Step 1: Manual Consensus-Based Replication] + S1_LEADER[Node1: Leader Only] + S1_FOLLOWERS[Node2 & Node3: Followers Only] + S1_MANUAL[Manual File Operations] + S1_CONSENSUS[Raft Consensus Protocol] + + S1_TITLE --> S1_LEADER + S1_TITLE --> S1_FOLLOWERS + 
S1_TITLE --> S1_MANUAL + S1_TITLE --> S1_CONSENSUS + end + + subgraph "Step 2: Live File Watching Addition" + S2_TITLE[Step 2: Leader-Initiated Live Replication] + S2_WATCHER[File Watcher on Leader Only] + S2_FSM[Custom Raft FSM for File Ops] + S2_AUTO[Automatic Replication] + S2_ADMIN[Admin Interface] + + S2_TITLE --> S2_WATCHER + S2_TITLE --> S2_FSM + S2_TITLE --> S2_AUTO + S2_TITLE --> S2_ADMIN + end + + subgraph "Step 3: Multi-Directional Revolution" + S3_TITLE[Step 3: Universal Multi-Directional Replication] + S3_WATCHERS[File Watchers on All Nodes] + S3_FORWARD[Follower Forwarding System] + S3_DEDUP[Content Hash Deduplication] + S3_SMART[Smart State Management] + + S3_TITLE --> S3_WATCHERS + S3_TITLE --> S3_FORWARD + S3_TITLE --> S3_DEDUP + S3_TITLE --> S3_SMART + end + + %% Evolution Flow + S1_TITLE -->|"Add Live Watching"| S2_TITLE + S2_TITLE -->|"Enable Multi-Direction"| S3_TITLE + + %% Feature Evolution + S1_CONSENSUS -->|"Enhanced with FSM"| S2_FSM + S2_FSM -->|"Enhanced with Deduplication"| S3_DEDUP + + S1_MANUAL -->|"Automated"| S2_AUTO + S2_AUTO -->|"Multi-Directional"| S3_SMART + + S1_LEADER -->|"Add File Watching"| S2_WATCHER + S2_WATCHER -->|"Expand to All Nodes"| S3_WATCHERS + + %% Capability Matrix + subgraph "Capability Comparison" + FEATURE1[Manual Operations] + FEATURE2[Live Detection] + FEATURE3[Multi-Direction] + FEATURE4[Loop Prevention] + FEATURE5[Content Deduplication] + FEATURE6[Concurrent Users] + + STEP1_CAP[Step 1: ✅ ❌ ❌ ❌ ❌ ❌] + STEP2_CAP[Step 2: ✅ ✅ ❌ ⚠️ ❌ ❌] + STEP3_CAP[Step 3: ✅ ✅ ✅ ✅ ✅ ✅] + end + + FEATURE1 --> STEP1_CAP + FEATURE2 --> STEP2_CAP + FEATURE3 --> STEP3_CAP + end +``` + +## Evolution Timeline + +### Phase 1: Foundation (Basic Raft Replication) +**Implementation**: `cmd/replication/main.go` +**Goal**: Establish distributed consensus foundation +**Achievement**: Strong consistency with manual operations + +### Phase 2: Automation (Live Replication) +**Implementation**: [DELETED] +**Goal**: Add real-time file monitoring 
and automatic replication +**Achievement**: Leader-initiated live file synchronization + +### Phase 3: Multi-Directional (Advanced Replication) +**Implementation**: `cmd/multi_replication/main.go` +**Goal**: Enable any-node-to-all-nodes replication with deduplication +**Achievement**: Production-ready distributed file system + +## Architectural Comparison Matrix + +| Feature | Step 1 | Step 2 | Step 3 | +|---------|--------|--------|--------| +| **Distributed Consensus** | ✅ | ✅ | ✅ | +| **Manual File Operations** | ✅ | ✅ | ✅ | +| **Live File Detection** | ❌ | ✅ | ✅ | +| **Automatic Replication** | ❌ | ✅ | ✅ | +| **Multi-Directional Flow** | ❌ | ❌ | ✅ | +| **Content Deduplication** | ❌ | ❌ | ✅ | +| **Loop Prevention** | ❌ | ⚠️ Global | ✅ Per-Node | +| **Concurrent Users** | ❌ | ❌ | ✅ | +| **Follower Initiation** | ❌ | ❌ | ✅ | +| **Smart State Management** | ❌ | ❌ | ✅ | + +## Technical Evolution + +### Core Technology Stack +- **Language**: Go 1.21+ +- **Consensus**: HashiCorp Raft +- **File Watching**: fsnotify library +- **Storage**: BoltDB (Raft state) +- **Network**: TCP (Raft + Admin) +- **Hashing**: SHA-256 (content deduplication) + +### Key Architectural Decisions + +#### Step 1: Consensus Foundation +- **Decision**: Use HashiCorp Raft for distributed consensus +- **Rationale**: Proven, production-ready consensus implementation +- **Impact**: Strong consistency guarantees with leader election and log replication + +#### Step 2: Event-Driven Architecture +- **Decision**: Add fsnotify for real-time file system monitoring +- **Rationale**: Enable automatic replication without manual intervention +- **Impact**: Live synchronization with leader-initiated changes + +#### Step 3: Multi-Directional Design +- **Decision**: Implement follower forwarding with content deduplication +- **Rationale**: Enable truly distributed operations from any node +- **Impact**: Production-ready system supporting concurrent users + +## Component Evolution + +### 1. 
Storage Manager Evolution +``` +Step 1: Basic Raft coordination + ↓ +Step 2: + File watching integration + ↓ +Step 3: + Forwarding and deduplication +``` + +### 2. Command Structure Evolution +``` +Step 1: Basic operations + ↓ +Step 2: { Op, Path, Data } + ↓ +Step 3: { Op, Path, Data, Hash, NodeID, Sequence } +``` + +### 3. Network Architecture Evolution +``` +Step 1: Raft ports only (8001-8003) + ↓ +Step 2: + Admin ports (9001-9003) + ↓ +Step 3: + Forwarding protocol +``` + +## Performance Evolution + +### Latency Improvements +- **Step 1**: Manual operations (minutes) +- **Step 2**: Live replication (1-4 seconds) +- **Step 3**: Multi-directional replication (1-4 seconds with deduplication) -1. **Step 1 - Basic Raft Replication**: Simple consensus-based replication -2. **Step 2 - Live File Watching**: Added real-time file system monitoring -3. **Step 3 - Multi-Directional Replication**: Full bidirectional file synchronization -4. **Step 4 - Unified CLI**: Consolidated all functionality into single `pickbox` binary +### Throughput Capabilities +- **Step 1**: Single-user, manual +- **Step 2**: Single-user, automatic +- **Step 3**: Multi-user, concurrent -## Current Architecture Details +### Resource Utilization +- **Step 1**: ~50MB per node +- **Step 2**: ~70MB per node +- **Step 3**: ~80MB per node -### Project Structure +## Use Case Evolution + +### Step 1: Development/Testing +- Basic distributed consensus validation +- Manual replication for simple scenarios +- Educational purposes and proof-of-concept + +### Step 2: Single-User Production +- Automated backup and replication +- Development environments with live sync +- Single-point-of-edit scenarios + +### Step 3: Multi-User Production +- Collaborative development environments +- Distributed teams with concurrent editing +- High-availability file systems +- Production applications requiring strong consistency + +## Testing Evolution + +### Test Coverage Growth ``` -cmd/ -└── pickbox/ # Main CLI application - 
├── main.go # Entry point and CLI commands - ├── node.go # Node management (start/multi) - ├── multi_replication.go # Multi-directional replication logic - ├── cluster.go # Cluster management commands - └── script.go # Script execution commands +Step 1: Basic cluster formation and consensus + ↓ +Step 2: + Live file monitoring and replication + ↓ +Step 3: + Multi-directional scenarios and deduplication ``` -### Key Components - -#### 1. **Unified CLI Interface** -- **Command Structure**: `pickbox [command] [subcommand] [flags]` -- **Node Commands**: `start` (full-featured), `multi` (multi-directional replication) -- **Cluster Commands**: `status`, `join` for cluster management -- **Script Commands**: `demo-3-nodes`, `cleanup` for automation - -#### 2. **Multi-Directional Replication Engine** -- **Real-time Monitoring**: `fsnotify` for file system events -- **Conflict Resolution**: Content-based deduplication with SHA-256 -- **Consensus Protocol**: Raft for strong consistency -- **Forwarding**: Non-leaders forward changes to leader - -#### 3. **Monitoring & Administration** -- **Metrics Collection**: Performance and health metrics -- **Dashboard**: Web-based cluster visualization -- **Admin Interface**: TCP-based cluster management -- **Structured Logging**: Comprehensive debugging support - -### Port Allocation Schema -- **Raft Communication**: Base port (default 8001+) -- **Admin Interface**: Base port + 1000 (default 9001+) -- **Monitoring**: Base port + 2000 (default 6001+) -- **Dashboard**: Shared port (default 8080) - -## Key Architectural Improvements - -### 1. **Unified Binary** -- **Before**: Multiple separate binaries (`cmd/multi_replication`, `cmd/live_replication`) -- **After**: Single `pickbox` binary with subcommands -- **Benefits**: Simplified deployment, consistent CLI, reduced maintenance - -### 2. 
**Enhanced Configuration** -- **Validation**: Comprehensive config validation with detailed error messages -- **Flexibility**: Support for various deployment scenarios -- **Defaults**: Sensible defaults for quick setup - -### 3. **Robust Error Handling** -- **Graceful Degradation**: System continues operating despite non-critical failures -- **Detailed Logging**: Structured logging for debugging and monitoring -- **Recovery**: Automatic recovery from transient failures - -### 4. **Comprehensive Testing** -- **Unit Tests**: Full coverage for all components -- **Integration Tests**: End-to-end cluster testing -- **Benchmarks**: Performance testing and optimization -- **Test Utilities**: Reusable testing infrastructure - -## Migration Path - -### For Users -- **Old**: `go run cmd/multi_replication/main.go -node node1 -port 8001` -- **New**: `./bin/pickbox node multi --node-id node1 --port 8001 --bootstrap` - -### For Developers -- **Old**: Separate codebases for different replication modes -- **New**: Unified codebase with mode selection via CLI flags - -### For Deployment -- **Old**: Multiple binaries to deploy and manage -- **New**: Single binary with configuration files - -## Future Enhancements - -### Planned Features -1. **Dynamic Scaling**: Add/remove nodes without restart -2. **Advanced Monitoring**: Prometheus metrics and alerting -3. **Security**: TLS encryption and authentication -4. 
**Performance**: Optimization for large files and clusters - -### Architecture Considerations -- **Microservices**: Potential split into specialized services -- **Cloud Integration**: Support for cloud storage backends -- **API Gateway**: RESTful API for external integrations - -## Benefits of Current Architecture - -### **Operational Benefits** -- **Simplified Deployment**: Single binary to deploy -- **Consistent Interface**: Unified CLI across all operations -- **Easy Maintenance**: Centralized codebase and documentation - -### **Development Benefits** -- **Code Reuse**: Shared libraries and utilities -- **Testing**: Comprehensive test coverage -- **Documentation**: Unified documentation and examples - -### **User Benefits** -- **Ease of Use**: Intuitive CLI commands -- **Flexibility**: Support for various deployment scenarios -- **Reliability**: Robust error handling and recovery - -This evolutionary approach has resulted in a mature, production-ready distributed storage system that maintains backward compatibility while providing enhanced functionality and ease of use. 
\ No newline at end of file +### Test Scripts Evolution +- **`test_replication.sh`**: Basic functionality +- **`test_multi_replication.sh`**: Comprehensive multi-directional testing + +## Deployment Evolution + +### Simple Deployment (Step 1) +```bash +# Start nodes manually +./cmd/replication/main -node-id=node1 -port=8001 +./cmd/replication/main -node-id=node2 -port=8002 +./cmd/replication/main -node-id=node3 -port=8003 + +# Manual cluster formation +go run scripts/add_nodes.go +``` + +### Automated Deployment (Step 2) +```bash +# Automated cluster with live replication +# [SCRIPT DELETED] +``` + +### Production Deployment (Step 3) +```bash +# Full multi-directional cluster +./scripts/run_multi_replication.sh + +# Comprehensive testing +./scripts/tests/test_multi_replication.sh +``` + +## Future Evolution Possibilities + +### Potential Step 4: Enhanced Features +- **Conflict Resolution**: Advanced merge strategies +- **Encryption**: At-rest and in-transit encryption +- **Compression**: File compression for storage efficiency +- **Metrics**: Prometheus/Grafana monitoring + +### Potential Step 5: Scale-Out +- **Dynamic Clustering**: Automatic node discovery +- **Partitioning**: Data sharding across clusters +- **Federation**: Multi-cluster coordination +- **Load Balancing**: Intelligent request routing + +## Lessons Learned + +### Architectural Principles +1. **Incremental Enhancement**: Each step builds upon previous foundation +2. **Backwards Compatibility**: Earlier features remain functional +3. **Strong Consistency**: Never compromise on data integrity +4. **Operational Simplicity**: Maintain ease of deployment and testing + +### Technical Insights +1. **Raft Reliability**: HashiCorp Raft provides excellent foundation +2. **Event-Driven Benefits**: File watching enables responsive systems +3. **Deduplication Necessity**: Content hashing prevents infinite loops +4. 
**State Management**: Per-node state tracking scales better than global + +### Production Readiness +1. **Testing Importance**: Comprehensive test suites catch edge cases +2. **Documentation Value**: Clear architecture docs enable maintenance +3. **Script Automation**: Deployment scripts reduce operational overhead +4. **Performance Monitoring**: Built-in logging enables troubleshooting + +## Conclusion + +The Pickbox distributed storage system represents a successful evolution from basic distributed consensus to a production-ready, multi-directional file replication system. Each architectural phase solved specific limitations while maintaining the strengths of previous implementations. + +The final Step 3 implementation achieves: +- ✅ **Strong Consistency**: Raft consensus guarantees +- ✅ **High Availability**: Fault-tolerant multi-node design +- ✅ **User-Friendly**: Any-node editing capability +- ✅ **Performance**: Sub-second change detection with 1-4 second replication +- ✅ **Reliability**: Zero infinite loops with intelligent deduplication +- ✅ **Scalability**: Concurrent multi-user support + +This evolution demonstrates how complex distributed systems can be built incrementally, with each phase providing immediate value while building toward more advanced capabilities. \ No newline at end of file diff --git a/.cursor/debug/n_node_implementation.md b/.cursor/debug/n_node_implementation.md index 28c2617..2d99b31 100644 --- a/.cursor/debug/n_node_implementation.md +++ b/.cursor/debug/n_node_implementation.md @@ -1,62 +1,114 @@ -# N-Node Implementation Guide +# N-Node Generic Implementation for Pickbox ## Overview -This document describes the implementation of the N-Node cluster functionality in Pickbox, which allows creating and managing clusters of any size (not just 3 nodes) with automatic port assignment and flexible configuration. +The Pickbox distributed storage system has been enhanced to support **N nodes** instead of being hardcoded for exactly 3 nodes. 
This implementation provides flexible cluster management with configurable parameters for any number of nodes. -## Current Architecture +## Key Changes Made -### Unified CLI Implementation -- **Main CLI**: `cmd/pickbox/main.go` - Unified command-line interface -- **Node Management**: `cmd/pickbox/node.go` - Node lifecycle management -- **Multi-Replication**: `cmd/pickbox/multi_replication.go` - Multi-directional replication logic -- **Cluster Management**: `cmd/pickbox/cluster.go` - Cluster operations +### 1. Generic Cluster Manager (`scripts/cluster_manager.sh`) -### Binary Usage +**New Features:** +- Supports 1 to 20+ nodes (configurable) +- Parameterized port assignments +- Configuration file support +- Dynamic node discovery +- Comprehensive cluster management (start, stop, restart, status, clean, logs) + +**Usage Examples:** ```bash -# Build the unified binary -go build -o bin/pickbox ./cmd/pickbox +# Start 5-node cluster +./scripts/cluster_manager.sh start -n 5 -# Start nodes using the CLI -./bin/pickbox node multi --node-id node1 --port 8001 --bootstrap -./bin/pickbox node multi --node-id node2 --port 8002 --join 127.0.0.1:8001 -./bin/pickbox node multi --node-id node3 --port 8003 --join 127.0.0.1:8001 +# Start 7-node cluster with custom ports +./scripts/cluster_manager.sh start -n 7 -p 9000 -a 10000 + +# Use configuration file +./scripts/cluster_manager.sh start -c examples/cluster-configs/5-node-cluster.conf + +# Check cluster status +./scripts/cluster_manager.sh status -n 5 + +# View logs from all nodes +./scripts/cluster_manager.sh logs -n 5 +``` + +### 2. 
Enhanced Node Addition (`scripts/add_nodes.go`) + +**Improvements:** +- Generic node count parameter (`-nodes N`) +- Configurable port ranges (`-base-port P`) +- Flexible starting node number (`-start N`) +- Support for remote clusters (`-host H`) + +**Usage Examples:** +```bash +# Add 2 nodes (default: node2, node3) +go run scripts/add_nodes.go + +# Add 5 nodes starting from node2 +go run scripts/add_nodes.go -nodes 5 + +# Add nodes to cluster with custom ports +go run scripts/add_nodes.go -nodes 3 -base-port 9000 -admin-port 10000 + +# Add nodes starting from node4 (for expanding clusters) +go run scripts/add_nodes.go -nodes 2 -start 4 ``` -## Implementation Components +### 3. Flexible Main Application (`cmd/multi_replication/main.go`) -### 1. **Port Allocation System** +**Key Change:** +- Removed hardcoded "node1" bootstrap assumption +- Any node can bootstrap when no join address is specified +- More flexible cluster initialization + +**Before:** ```go -// Port calculation logic in cmd/pickbox/ -func calculatePorts(basePort int) (int, int, int) { - raftPort := basePort - adminPort := basePort + 1000 - monitorPort := basePort + 2000 - return raftPort, adminPort, monitorPort +// Auto-bootstrap if no join address and this is node1 +if cfg.JoinAddr == "" && cfg.NodeID == "node1" { + cfg.BootstrapCluster = true } ``` -### 2. **Configuration Management** +**After:** ```go -// Enhanced configuration in cmd/pickbox/multi_replication.go -type MultiConfig struct { - NodeID string - Port int - AdminPort int - MonitorPort int - DashboardPort int - DataDir string - Join string - Host string - Bootstrap bool +// Auto-bootstrap if no join address is specified +// This allows any node to bootstrap when it's the first in the cluster +if cfg.JoinAddr == "" { + cfg.BootstrapCluster = true } ``` -### 3. **Cluster Management Scripts** -The `scripts/cluster_manager.sh` script provides comprehensive N-node cluster management: +### 4. 
Generic Test Suite (`scripts/tests/test_n_replication.sh`) + +**Features:** +- Tests any number of nodes +- Configurable timeouts and ports +- Comprehensive validation (file operations, deduplication, consistency) +- Verbose output support +**Usage Examples:** +```bash +# Test 5-node cluster +./scripts/tests/test_n_replication.sh -n 5 + +# Test with custom configuration +./scripts/tests/test_n_replication.sh -n 7 -p 9000 -a 10000 -v + +# Quick test with timeout +./scripts/tests/test_n_replication.sh -n 4 -t 120 +``` + +### 5. Configuration File System + +**Example Configurations:** +- `examples/cluster-configs/5-node-cluster.conf` - Standard 5-node setup +- `examples/cluster-configs/7-node-cluster.conf` - 7-node cluster +- `examples/cluster-configs/10-node-high-ports.conf` - 10-node with high ports + +**Configuration Format:** ```bash -# Configuration structure NODE_COUNT=5 BASE_PORT=8001 ADMIN_BASE_PORT=9001 @@ -64,171 +116,177 @@ MONITOR_BASE_PORT=6001 DASHBOARD_PORT=8080 HOST=127.0.0.1 DATA_DIR=data -BINARY=./bin/pickbox -BINARY_ARGS="node multi" +BINARY=cmd/multi_replication/main.go ``` -## Key Features +## Port Assignment Schema -### 1. **Dynamic Node Count** -- Support for 1 to 20+ nodes -- Automatic port assignment -- Scalable configuration +The new implementation uses a systematic port assignment: -### 2. **Flexible Configuration** -- Configuration files for different scenarios -- Environment-specific settings -- Override capabilities +### Formula +- **Raft Port**: `BASE_PORT + (node_number - 1)` +- **Admin Port**: `ADMIN_BASE_PORT + (node_number - 1)` +- **Monitor Port**: `MONITOR_BASE_PORT + (node_number - 1)` +- **Dashboard Port**: Shared across all nodes -### 3. 
**Automated Management** -- Cluster lifecycle management -- Health monitoring -- Cleanup utilities +### Example for 5-Node Cluster (BASE_PORT=8001) +``` +node1: Raft=8001, Admin=9001, Monitor=6001 +node2: Raft=8002, Admin=9002, Monitor=6002 +node3: Raft=8003, Admin=9003, Monitor=6003 +node4: Raft=8004, Admin=9004, Monitor=6004 +node5: Raft=8005, Admin=9005, Monitor=6005 +Dashboard: 8080 (shared) +``` -## Usage Examples +## Usage Patterns -### Basic Usage +### Quick Start (5-Node Cluster) ```bash -# Start a 5-node cluster +# Start cluster ./scripts/cluster_manager.sh start -n 5 -# Start with custom ports -./scripts/cluster_manager.sh start -n 7 -p 9000 -a 10000 +# Test replication +echo "Hello from node1!" > data/node1/test.txt +echo "Hello from node3!" > data/node3/test.txt -# Use configuration file -./scripts/cluster_manager.sh start -c examples/cluster-configs/5-node-cluster.conf +# Verify replication +ls data/node*/ +cat data/node*/test.txt ``` -### Configuration Files -Example configuration for different scenarios: - -#### Standard 5-Node Setup +### Production Setup (10-Node Cluster) ```bash -# examples/cluster-configs/5-node-cluster.conf -NODE_COUNT=5 -BASE_PORT=8001 -ADMIN_BASE_PORT=9001 -MONITOR_BASE_PORT=6001 -DASHBOARD_PORT=8080 -HOST=127.0.0.1 -DATA_DIR=data -BINARY=./bin/pickbox -BINARY_ARGS="node multi" +# Use configuration file approach +./scripts/cluster_manager.sh start -c examples/cluster-configs/10-node-high-ports.conf + +# Monitor cluster +./scripts/cluster_manager.sh status -c examples/cluster-configs/10-node-high-ports.conf + +# Run comprehensive tests +./scripts/tests/test_n_replication.sh -n 10 -p 18001 -a 19001 -v ``` -#### High-Port Configuration +### Development Testing ```bash -# examples/cluster-configs/10-node-high-ports.conf -NODE_COUNT=10 -BASE_PORT=18001 -ADMIN_BASE_PORT=19001 -MONITOR_BASE_PORT=16001 -DASHBOARD_PORT=18080 -HOST=127.0.0.1 -DATA_DIR=data -BINARY=./bin/pickbox -BINARY_ARGS="node multi" +# Quick 3-node test (backward 
compatible) +./scripts/cluster_manager.sh start -n 3 + +# Test different sizes +for nodes in 3 5 7; do + echo "Testing $nodes nodes..." + ./scripts/tests/test_n_replication.sh -n $nodes -t 60 +done ``` -## Advanced Features +## Backward Compatibility -### 1. **Multi-Environment Support** -```bash -# Development cluster -./scripts/cluster_manager.sh start -n 3 -p 8001 --data-dir dev +### Existing Scripts Still Work +All existing 3-node scripts remain functional: +- `scripts/run_multi_replication.sh` - Still works for 3 nodes +- `scripts/tests/test_multi_replication.sh` - Still tests 3 nodes -# Staging cluster -./scripts/cluster_manager.sh start -n 5 -p 12001 --data-dir staging +### Migration Path +1. **Keep using existing scripts** for current workflows +2. **Gradually adopt generic scripts** for new clusters +3. **Use configuration files** for complex setups -# Production cluster -./scripts/cluster_manager.sh start -n 7 -p 18001 --data-dir prod -``` +## Advanced Features -### 2. **Dynamic Node Addition** +### Dynamic Cluster Expansion ```bash # Start with 3 nodes ./scripts/cluster_manager.sh start -n 3 -# Add additional nodes -go run scripts/add_nodes.go -nodes 2 -start 4 +# Later expand to 5 nodes (in separate terminal) +./scripts/cluster_manager.sh start -n 2 -start 4 # Add node4, node5 +go run scripts/add_nodes.go -nodes 2 -start 4 # Add to cluster ``` -### 3. 
**Testing and Validation** +### Multi-Environment Setup ```bash -# Test N-node cluster -./scripts/tests/test_n_replication.sh -n 5 -v +# Development cluster (low ports) +./scripts/cluster_manager.sh start -n 3 -p 8001 -# Test with custom configuration -./scripts/tests/test_n_replication.sh -n 10 -p 18001 +# Staging cluster (medium ports) +./scripts/cluster_manager.sh start -n 5 -p 12001 --data-dir staging_data + +# Testing cluster (high ports) +./scripts/cluster_manager.sh start -n 7 -p 18001 --data-dir test_data ``` -## Benefits +### Custom Binary Testing +```bash +# Test with different binary +# [DELETED] ./scripts/cluster_manager.sh start -n 4 --binary cmd/multi_replication/main.go -### 1. **Scalability** -- Support for large clusters -- Efficient resource utilization -- Horizontal scaling capabilities +# Test with configuration +echo "# [BINARY DELETED - use cmd/multi_replication/main.go instead]" >> custom.conf +./scripts/cluster_manager.sh start -c custom.conf +``` -### 2. **Flexibility** -- Configurable port ranges -- Multiple deployment scenarios -- Environment-specific settings +## Validation and Testing -### 3. **Automation** -- Automated cluster management -- Simplified deployment -- Comprehensive testing +### Comprehensive Test Coverage +The new test suite validates: +- ✅ **Cluster Formation**: N-node startup and joining +- ✅ **Multi-directional Replication**: Any node → all nodes +- ✅ **Content Consistency**: Files identical across all nodes +- ✅ **Deduplication**: No infinite loops from simultaneous writes +- ✅ **Fault Tolerance**: Graceful handling of node failures +- ✅ **Performance**: Scaling characteristics with node count -### 4. **Reliability** -- Fault tolerance -- Health monitoring -- Graceful degradation +### Test Results Summary +```bash +# Example test output for 5-node cluster +🎉 SUCCESS: All N-node replication tests passed! 
+ +✅ Tested successfully: + • 5-node cluster startup + • Multi-directional file replication + • Content consistency across all nodes + • Deduplication and conflict resolution +``` -## Implementation Notes +## Benefits of N-Node Implementation -### Port Allocation Schema -- **Raft Port**: BASE_PORT + node_number - 1 -- **Admin Port**: ADMIN_BASE_PORT + node_number - 1 -- **Monitor Port**: MONITOR_BASE_PORT + node_number - 1 -- **Dashboard Port**: Shared across all nodes +### 1. **Flexibility** +- Support for any cluster size (1-20+ nodes) +- Configurable port ranges to avoid conflicts +- Environment-specific configurations -### Configuration Management -- Default values for quick setup -- Override capabilities for custom scenarios -- Validation and error handling +### 2. **Scalability** +- Easy horizontal scaling +- Performance testing with different node counts +- Production-ready large clusters -### Testing Strategy -- Unit tests for core functionality -- Integration tests for cluster operations -- Performance benchmarks -- End-to-end validation +### 3. **Development Efficiency** +- Single toolset for all cluster sizes +- Consistent management interface +- Automated testing for various configurations -## Migration from Legacy Implementation +### 4. **Production Readiness** +- Port conflict resolution +- Resource isolation between environments +- Comprehensive monitoring and logging -### Old Structure -``` -cmd/ -├── multi_replication/ -│ └── main.go -└── live_replication/ - └── main.go -``` +## Future Enhancements -### New Structure -``` -cmd/ -└── pickbox/ - ├── main.go - ├── node.go - ├── multi_replication.go - ├── cluster.go - └── script.go -``` +### Potential Improvements +1. **Auto-discovery**: Automatic node detection without manual configuration +2. **Load Balancing**: Intelligent request routing across nodes +3. **Health Monitoring**: Automated failure detection and recovery +4. **Dynamic Reconfiguration**: Runtime cluster resizing +5. 
**Multi-host Support**: Distributed across multiple machines + +### Configuration Management +1. **Kubernetes Integration**: Helm charts for N-node deployments +2. **Docker Compose**: Multi-container orchestration +3. **Environment Variables**: Cloud-native configuration +4. **Service Discovery**: Integration with Consul/etcd + +## Conclusion -### Migration Steps -1. Replace multiple binaries with single `pickbox` binary -2. Update CLI commands to use new structure -3. Migrate configuration files to new format -4. Update scripts to use new binary and arguments +The N-node implementation transforms Pickbox from a fixed 3-node system into a truly scalable distributed storage solution. With generic tooling, comprehensive testing, and flexible configuration, it's now ready for both development experimentation and production deployment at any scale. -This unified approach provides a more maintainable and user-friendly system while preserving all the functionality of the original multi-directional replication system. \ No newline at end of file +**Key Takeaway**: The same codebase now powers anything from a single-node development setup to a large production cluster, with consistent behavior and management tools across all configurations. \ No newline at end of file diff --git a/.cursor/debug/step2_live_replication.md b/.cursor/debug/step2_live_replication.md deleted file mode 100644 index efd1b65..0000000 --- a/.cursor/debug/step2_live_replication.md +++ /dev/null @@ -1,282 +0,0 @@ -# Step 2: Live File Replication Architecture - -## Overview - -The second implementation builds upon the basic Raft foundation by adding real-time file watching capabilities. This enables automatic replication of file changes from the leader to all followers without manual intervention. - -## Architecture Diagram - -```mermaid -graph TB - subgraph "Step 2: Live File Replication System" - subgraph "Node 1 (Leader)" - N1[Node 1
Port: 8001] - FW1[File Watcher
fsnotify] - FSM1[Raft FSM] - RF1[Raft Instance] - FS1[File System
data/node1/] - ADM1[Admin Server
Port: 9001] - - N1 --> FW1 - N1 --> FSM1 - N1 --> ADM1 - FW1 -->|"File Changes"| FSM1 - FSM1 --> RF1 - FSM1 --> FS1 - FSM1 -.->|"Pause/Resume"| FW1 - end - - subgraph "Node 2 (Follower)" - N2[Node 2
Port: 8002] - FSM2[Raft FSM] - RF2[Raft Instance] - FS2[File System
data/node2/] - ADM2[Admin Server
Port: 9002] - - N2 --> FSM2 - N2 --> ADM2 - FSM2 --> RF2 - FSM2 --> FS2 - end - - subgraph "Node 3 (Follower)" - N3[Node 3
Port: 8003] - FSM3[Raft FSM] - RF3[Raft Instance] - FS3[File System
data/node3/] - ADM3[Admin Server
Port: 9003] - - N3 --> FSM3 - N3 --> ADM3 - FSM3 --> RF3 - FSM3 --> FS3 - end - - subgraph "External Interaction" - USER[User
File Operations] - ADMIN[Admin Tool
Cluster Management] - end - - %% Raft Consensus Communication - RF1 -->|"Log Replication
(Commands)"| RF2 - RF1 -->|"Log Replication
(Commands)"| RF3 - RF2 -.->|"Heartbeats"| RF1 - RF3 -.->|"Heartbeats"| RF1 - - %% Command Flow - FSM1 -->|"WRITE Commands"| RF1 - RF1 -->|"Apply Log"| FSM2 - RF1 -->|"Apply Log"| FSM3 - - %% User Interactions - USER -->|"Create/Edit Files"| FS1 - ADMIN --> ADM1 - ADMIN --> ADM2 - ADMIN --> ADM3 - - %% Real-time Detection - FS1 -.->|"File Events"| FW1 - - %% Automatic Replication - FSM2 -->|"Auto Apply"| FS2 - FSM3 -->|"Auto Apply"| FS3 - - %% Synchronization - FS1 -.->|"Real-time Sync"| FS2 - FS1 -.->|"Real-time Sync"| FS3 - end -``` - -## Architecture Components - -### Core Enhancements - -1. **File Watcher Integration** (`fsnotify`) - - Real-time monitoring of file system changes - - Automatic detection of file creation, modification, deletion - - Integration with Raft consensus for replication - -2. **Raft Finite State Machine (FSM)** - - Custom FSM implementation for file operations - - Handles WRITE and DELETE commands - - Manages replication state and conflict resolution - -3. **Admin Interface** - - TCP server for cluster management - - Commands: ADD_VOTER, cluster status - - Port offset: Base port + 1000 - -### Enhanced Node Architecture - -Each node now includes: -- **File Watcher**: Monitors local file system changes (leader only) -- **Raft FSM**: Processes file operation commands -- **Admin Server**: Management interface -- **Synchronization Logic**: Prevents infinite loops - -### Key Improvements from Step 1 - -#### ✅ New Features -- **Real-time File Watching**: Automatic detection of file changes -- **Automatic Replication**: No manual intervention required -- **Command-based Operations**: Structured file operations through Raft -- **Admin Interface**: TCP-based cluster management -- **Loop Prevention**: Smart pausing during replication - -#### 🔄 Enhanced Components -- **FSM Implementation**: Custom file operation state machine -- **Event-driven Architecture**: File system events trigger replication -- **Improved Logging**: Detailed operation tracking 
- -## Data Flow - -### Automatic File Replication -``` -1. User creates/modifies file in data/node1/ -2. File Watcher detects change → Generate event -3. Leader FSM creates WRITE command -4. Raft consensus → Replicate command to followers -5. Follower FSMs apply command → Update local files -6. All nodes synchronized automatically -``` - -### Event Processing Pipeline -``` -File Change → fsnotify Event → Command Creation → Raft Log → -Consensus → Apply to FSM → File System Update → Sync Complete -``` - -### Conflict Prevention -``` -1. Before applying command → Pause file watching -2. Apply file operation → Update local file system -3. Resume file watching → Prevent infinite loops -4. Global state tracking → Avoid duplicate operations -``` - -## Implementation Details - -### Code Structure -- **Entry Point**: `cmd/live_replication/main.go` -- **FSM Logic**: Custom Raft FSM implementation -- **File Watching**: `fsnotify` integration -- **Admin Server**: TCP management interface - -### Command Structure -```go -type Command struct { - Op string // "write" or "delete" - Path string // Relative file path - Data []byte // File content -} -``` - -### File Watcher Logic -- **Target**: Leader node only watches files -- **Events**: CREATE, WRITE operations trigger replication -- **Filtering**: Ignores Raft internal files and directories -- **Synchronization**: Global pause mechanism prevents loops - -### Network Architecture -- **Raft Ports**: 8001, 8002, 8003 -- **Admin Ports**: 9001, 9002, 9003 -- **Protocol**: TCP for both Raft and admin - -## Key Features - -### ✅ Implemented -- **Live File Monitoring**: Real-time change detection -- **Automatic Replication**: No manual copying required -- **Strong Consistency**: Raft ensures all nodes stay synchronized -- **Leader-only Writes**: Only leader initiates replication -- **Loop Prevention**: Smart pausing prevents infinite replication - -### ⚠️ Limitations -- **Unidirectional**: Only leader → followers replication -- 
**Global Pause**: Affects all file watching during replication -- **Leader Dependency**: All changes must originate from leader -- **Single Point of Origin**: Cannot replicate from followers - -## Testing and Verification - -### Test Scripts -- **`scripts/tests/test_live_replication.sh`**: Comprehensive testing -- **`scripts/run_live_replication.sh`**: Interactive demo - -### Test Scenarios -1. **File Creation**: Create file on leader → Verify on all nodes -2. **File Modification**: Modify file on leader → Check replication -3. **Multiple Files**: Create multiple files → Verify count consistency -4. **Content Verification**: Compare file contents across nodes - -### Performance Metrics -- **Detection Latency**: < 100ms file change detection -- **Replication Time**: 1-4 seconds end-to-end -- **Consistency**: 100% content matching across nodes - -## Storage Layout - -``` -data/ -├── node1/ (Leader) -│ ├── raft/ -│ │ ├── logs.dat # Raft log entries -│ │ ├── stable.dat # Raft state -│ │ └── snapshots/ # Raft snapshots -│ ├── welcome.txt # Initial file -│ └── *.txt # User files (watched) -├── node2/ (Follower) -│ ├── raft/ # Raft state (replica) -│ └── *.txt # Replicated files -└── node3/ (Follower) - ├── raft/ # Raft state (replica) - └── *.txt # Replicated files -``` - -## Configuration Parameters - -### File Watching -- **Watch Directory**: `data/nodeX/` -- **Ignored Patterns**: `raft-*`, `*.db`, `snapshots/` -- **Event Types**: CREATE, WRITE -- **Buffer Time**: 100ms pause after operations - -### Raft Configuration -- **Heartbeat Timeout**: 1 second -- **Election Timeout**: 1 second -- **Apply Timeout**: 5 seconds -- **Snapshot Interval**: 8192 log entries - -### Admin Interface -- **Port Range**: 9001-9003 -- **Commands**: ADD_VOTER, cluster status -- **Timeout**: 5 seconds - -## Resource Requirements - -### Per Node -- **Memory**: ~70MB (Raft + file watching) -- **CPU**: Low baseline + spikes during replication -- **Storage**: Raft logs + application data -- 
**Network**: TCP connections for Raft + admin - -### Cluster Total -- **Monitoring**: 1 active file watcher (leader) -- **Consensus**: 3-node Raft cluster -- **Admin Interfaces**: 3 management endpoints - -## Evolution to Step 3 - -This live replication implementation provides the foundation for multi-directional replication: - -### Current Limitations Addressed in Step 3 -1. **Unidirectional Flow**: Enable any node → all nodes replication -2. **Global Pause**: Implement per-node pause mechanisms -3. **Leader Dependency**: Allow followers to initiate changes -4. **Single Origin**: Support multiple simultaneous change sources - -### Architecture Preparation -- FSM command structure ready for enhancement -- File watching infrastructure established -- Admin interface available for forwarding -- Consensus mechanisms proven stable \ No newline at end of file diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 7e40622..9087758 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -109,8 +109,10 @@ jobs: sudo apt-get update sudo apt-get install -y netcat-openbsd curl - # Build pickbox binary - go build -o ./bin/pickbox ./cmd/pickbox + # Build test binaries + cd cmd/multi_replication + go build -o ../../bin/multi_replication . + cd ../.. # Make scripts executable chmod +x scripts/*.sh @@ -166,7 +168,7 @@ jobs: - name: Run Package Benchmarks run: | - go test -bench=. -benchmem -run=^$ ./pkg/storage ./cmd/pickbox > package-bench-results.txt + go test -bench=. 
-benchmem -run=^$ ./pkg/storage ./cmd/multi_replication > package-bench-results.txt cat package-bench-results.txt - name: Upload benchmark results @@ -209,12 +211,14 @@ jobs: - name: Download dependencies run: go mod download - - name: Build pickbox binary +# [REMOVED] live_replication binary build step + + - name: Build multi_replication binary env: GOOS: ${{ matrix.goos }} GOARCH: ${{ matrix.goarch }} run: | - go build -v -o bin/pickbox-${{ matrix.goos }}-${{ matrix.goarch }}${{ matrix.goos == 'windows' && '.exe' || '' }} ./cmd/pickbox + go build -v -o bin/multi_replication-${{ matrix.goos }}-${{ matrix.goarch }}${{ matrix.goos == 'windows' && '.exe' || '' }} ./cmd/multi_replication - name: Upload build artifacts uses: actions/upload-artifact@v4 @@ -541,11 +545,11 @@ jobs: - Comprehensive security scan results available in artifacts ### Downloads - - **Linux AMD64**: `pickbox-linux-amd64` - - **Linux ARM64**: `pickbox-linux-arm64` - - **macOS AMD64**: `pickbox-darwin-amd64` - - **macOS ARM64**: `pickbox-darwin-arm64` - - **Windows AMD64**: `pickbox-windows-amd64.exe` + - **Linux AMD64**: `multi_replication-linux-amd64` + - **Linux ARM64**: `multi_replication-linux-arm64` + - **macOS AMD64**: `multi_replication-darwin-amd64` + - **macOS ARM64**: `multi_replication-darwin-arm64` + - **Windows AMD64**: `multi_replication-windows-amd64.exe` ### Documentation - Coverage Report: `coverage.html` diff --git a/.gitignore b/.gitignore index aa7a0c4..0812d03 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,4 @@ config.json .Spotlight-V100 .Trashes ehthumbs.db -Thumbs.db - -./pickbox \ No newline at end of file +Thumbs.db \ No newline at end of file diff --git a/.golangci.yml b/.golangci.yml index a93c13f..b4d7ad1 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -29,7 +29,7 @@ linters-settings: min-occurrences: 2 goimports: - local-prefixes: github.com/addityasingh/pickbox + local-prefixes: github.com/aditya/pickbox unused: check-exported: true diff --git a/Makefile 
b/Makefile index 2ca31bf..d5f3f2d 100644 --- a/Makefile +++ b/Makefile @@ -9,12 +9,15 @@ help: ## Show this help message @awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf " %-15s %s\n", $$1, $$2}' $(MAKEFILE_LIST) # Build targets -.PHONY: build clean install -build: ## Build the main pickbox CLI binary - go build -v -o bin/pickbox ./cmd/pickbox +.PHONY: build build-all clean +build: ## Build the multi-replication binary + go build -v -o bin/multi_replication ./cmd/multi_replication -install: build ## Install pickbox CLI to $GOPATH/bin - cp bin/pickbox $(GOPATH)/bin/pickbox +build-all: ## Build all binaries + mkdir -p bin + go build -v -o bin/replication ./cmd/replication +# [REMOVED] live_replication build command + go build -v -o bin/multi_replication ./cmd/multi_replication clean: ## Clean build artifacts and test data rm -rf bin/ @@ -22,7 +25,7 @@ clean: ## Clean build artifacts and test data rm -rf /tmp/pickbox-* rm -rf /tmp/test-* rm -f coverage.out coverage.html - pkill -f pickbox || true + pkill -f multi_replication || true # Development setup .PHONY: setup install-tools install-pre-commit @@ -113,52 +116,19 @@ test-coverage: ## Run tests with coverage go tool cover -func=coverage.out test-bench: ## Run benchmark tests - go test -bench=. -benchmem ./pkg/storage ./cmd/pickbox + go test -bench=. 
-benchmem ./pkg/storage ./cmd/multi_replication -# CLI Demo and scripts -.PHONY: demo demo-cli demo-3-nodes demo-multi demo-cleanup -demo: demo-cli ## Run CLI demo (default) +# Demo and scripts +.PHONY: demo demo-multi demo-live demo-basic +demo: demo-multi ## Run multi-replication demo (default) -demo-cli: build ## Run 3-node cluster demo using CLI - ./bin/pickbox script demo-3-nodes - -demo-3-nodes: build ## Run 3-node cluster demo - ./bin/pickbox script demo-3-nodes - -demo-multi: build ## Run multi-directional replication demo using CLI - ./bin/pickbox node multi --node-id multi-demo --port 8010 - -demo-cleanup: ## Clean up demo data - ./bin/pickbox script cleanup || true - -# CLI commands examples -.PHONY: cli-help cli-start-node cli-start-cluster cli-join-cluster cli-status -cli-help: build ## Show CLI help - ./bin/pickbox --help - -cli-start-node: build ## Start a single node (bootstrap) - ./bin/pickbox node start --node-id node1 --port 8001 --bootstrap - -cli-start-cluster: build ## Start a 3-node cluster - @echo "Starting 3-node cluster..." - ./bin/pickbox node start --node-id node1 --port 8001 --bootstrap & - sleep 3 - ./bin/pickbox node start --node-id node2 --port 8002 --join 127.0.0.1:8001 & - ./bin/pickbox node start --node-id node3 --port 8003 --join 127.0.0.1:8001 & - @echo "Cluster started. Use 'make demo-cleanup' to stop." 
- -cli-join-cluster: build ## Join a node to existing cluster - ./bin/pickbox cluster join --leader 127.0.0.1:8001 --node-id node4 --node-addr 127.0.0.1:8004 - -cli-status: build ## Check cluster status - ./bin/pickbox cluster status --addr 127.0.0.1:9001 - -# Legacy demos (for backward compatibility) -.PHONY: demo-legacy demo-basic -demo-legacy: clean ## Run legacy multi-directional replication demo +demo-multi: clean ## Run multi-directional replication demo ./scripts/run_multi_replication.sh -demo-basic: clean ## Run basic replication demo (legacy) +demo-live: clean ## Run live replication demo +# [REMOVED] ./scripts/run_live_replication.sh + +demo-basic: clean ## Run basic replication demo ./scripts/run_replication.sh # Verification and CI simulation @@ -169,7 +139,7 @@ ci: ## Simulate CI pipeline locally $(MAKE) lint $(MAKE) security $(MAKE) test-coverage - $(MAKE) build + $(MAKE) build-all @echo "✅ CI simulation completed successfully!" pre-commit: ## Run pre-commit hooks manually @@ -186,7 +156,7 @@ verify-all: ## Run comprehensive verification (lint + test + security) .PHONY: docs docs: ## Generate and view documentation godoc -http=:6060 - @echo "Documentation available at http://localhost:6060/pkg/github.com/addityasingh/pickbox/" + @echo "Documentation available at http://localhost:6060/pkg/github.com/aditya/pickbox/" # Git helpers .PHONY: git-hooks diff --git a/README.md b/README.md index dc3eb36..2e96a63 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Pickbox is a distributed storage system implemented in Go that provides file ope ## Multi-Directional Replication Architecture -The current implementation provides advanced multi-directional file replication where any node can initiate changes that automatically propagate to all other nodes while maintaining strong consistency through Raft consensus. 
+The current implementation (Step 3) provides advanced multi-directional file replication where any node can initiate changes that automatically propagate to all other nodes while maintaining strong consistency through Raft consensus. ```mermaid graph TB @@ -155,43 +155,29 @@ graph TB ``` . -├── cmd/ -│ └── pickbox/ # Main CLI application -│ ├── main.go # Entry point -│ ├── node.go # Node management commands -│ ├── multi_replication.go # Multi-directional replication -│ ├── cluster.go # Cluster management -│ └── script.go # Script execution +├── cmd/ # Application entry points +│ ├── replication/ # Step 1: Basic Raft replication +│ └── multi_replication/ # Multi-directional replication ├── pkg/ -│ ├── storage/ # Storage layer -│ │ ├── manager.go # Storage manager implementation -│ │ ├── raft_manager.go # Raft consensus implementation -│ │ └── *_test.go # Tests -│ ├── replication/ # Replication logic -│ │ ├── fsm.go # Finite state machine -│ │ └── fsm_test.go # Tests -│ ├── watcher/ # File watching -│ │ ├── file_watcher.go # File system monitoring -│ │ ├── state_manager.go # State management -│ │ └── *_test.go # Tests -│ ├── monitoring/ # Monitoring and metrics -│ │ ├── metrics.go # Metrics collection -│ │ ├── dashboard.go # Dashboard UI -│ │ └── *_test.go # Tests -│ └── admin/ # Admin interface -│ ├── server.go # Admin server -│ └── server_test.go # Tests -├── test/ # Integration tests -│ ├── integration_test.go # End-to-end tests -│ ├── n_node_test.go # N-node cluster tests -│ └── *_test.go # Other test files +│ └── storage/ +│ ├── manager.go # Storage manager implementation +│ ├── raft_manager.go # Raft consensus implementation +│ └── raft_test.go # Raft tests ├── scripts/ # Automation scripts -│ ├── cluster_manager.sh # Cluster management -│ ├── demo_n_nodes.sh # N-node demos -│ └── tests/ # Test scripts -├── examples/ # Example configurations -│ └── cluster-configs/ # Cluster configuration files +│ ├── tests/ # Test scripts +│ │ ├── test_replication.sh +│ │ └── 
test_multi_replication.sh +│ ├── run_replication.sh # Demo scripts +│ ├── run_multi_replication.sh +│ ├── cleanup_replication.sh # Utility scripts +│ └── add_nodes.go ├── .cursor/debug/ # Architecture documentation +│ ├── step1_basic_raft_replication.md + +│ ├── step3_multi_directional_replication.md +│ └── architecture_evolution_overview.md +├── go.mod # Go module definition +├── go.sum # Go module checksums └── README.md # This file ``` @@ -209,32 +195,12 @@ graph TB cd pickbox ``` -2. **Build the application**: - ```bash - make build - # or - go build -o bin/pickbox ./cmd/pickbox - ``` - -3. **Setup development environment** (optional but recommended): +2. **Setup development environment** (optional but recommended): ```bash make setup # Install tools and pre-commit hooks ``` -4. **Start a cluster using the CLI**: - ```bash - # Start 3-node cluster using CLI - ./bin/pickbox node start --node-id node1 --port 8001 --bootstrap & - ./bin/pickbox node start --node-id node2 --port 8002 --join 127.0.0.1:8001 & - ./bin/pickbox node start --node-id node3 --port 8003 --join 127.0.0.1:8001 & - - # Or use multi-directional replication mode - ./bin/pickbox node multi --node-id node1 --port 8001 --bootstrap & - ./bin/pickbox node multi --node-id node2 --port 8002 --join 127.0.0.1:8001 & - ./bin/pickbox node multi --node-id node3 --port 8003 --join 127.0.0.1:8001 & - ``` - -5. **Alternative: Use cluster management scripts**: +3. **Start a cluster (any size)**: ```bash # 3-node cluster (backward compatible) ./scripts/cluster_manager.sh start -n 3 @@ -249,7 +215,7 @@ graph TB ./scripts/cluster_manager.sh start -c examples/cluster-configs/5-node-cluster.conf ``` -6. **Test the system**: +4. **Test the system**: ```bash # Create files on any node - they replicate everywhere! echo "Hello from node1!" > data/node1/test1.txt @@ -260,7 +226,7 @@ graph TB ls data/node*/ ``` -7. **Run comprehensive tests**: +5. 
**Run comprehensive tests**: ```bash # Test specific cluster size ./scripts/tests/test_n_replication.sh -n 5 @@ -275,38 +241,6 @@ graph TB - **nodeN**: Raft=800N, Admin=900N, Monitor=600N - **Dashboard**: 8080 (shared across all nodes) -## CLI Commands - -The `pickbox` CLI provides comprehensive cluster management: - -### Node Management -```bash -# Start a node -./bin/pickbox node start --node-id node1 --port 8001 --bootstrap - -# Start multi-directional replication -./bin/pickbox node multi --node-id node1 --port 8001 --bootstrap - -# Join existing cluster -./bin/pickbox node start --node-id node2 --port 8002 --join 127.0.0.1:8001 -``` - -### Cluster Management -```bash -# Check cluster status -./bin/pickbox cluster status --addr 127.0.0.1:9001 - -# Join cluster -./bin/pickbox cluster join --leader 127.0.0.1:8001 --node-id node4 --node-addr 127.0.0.1:8004 -``` - -### Script Execution -```bash -# Run predefined scripts -./bin/pickbox script demo-3-nodes -./bin/pickbox script cleanup -``` - ## Cluster Management (N-Node Support) Pickbox now supports **generic N-node clusters** with flexible configuration. You can run anywhere from 1 to 20+ nodes with automatic port assignment and cluster management. 
@@ -347,8 +281,7 @@ MONITOR_BASE_PORT=6001 DASHBOARD_PORT=8080 HOST=127.0.0.1 DATA_DIR=data -BINARY=./bin/pickbox -BINARY_ARGS="node multi" +BINARY=cmd/multi_replication/main.go ``` ### Advanced Usage @@ -375,7 +308,6 @@ All existing 3-node scripts remain functional: ```bash # Legacy scripts (still work) ./scripts/run_multi_replication.sh # 3-node cluster -./scripts/run_live_replication.sh # Live replication demo ./scripts/tests/test_multi_replication.sh # 3-node tests ``` @@ -423,17 +355,12 @@ find data/ -name "*.txt" -exec echo "=== {} ===" \; -exec cat {} \; echo "STATUS" | nc localhost 9001 # Node 1 admin port echo "STATUS" | nc localhost 9002 # Node 2 admin port echo "STATUS" | nc localhost 9003 # Node 3 admin port - -# Or use the CLI -./bin/pickbox cluster status --addr 127.0.0.1:9001 ``` **Cleanup**: ```bash # Clean up all processes and data ./scripts/cleanup_replication.sh -# or -./bin/pickbox script cleanup ``` ## Implementation Details @@ -465,7 +392,7 @@ The system uses structured logging via `logrus` for better observability. Logs i Pickbox includes a comprehensive test suite covering unit tests, integration tests, and benchmarks. The system provides: -- **Unit Tests**: Storage package, Raft manager, and pickbox CLI components *(active)* +- **Unit Tests**: Storage package, Raft manager, and multi-replication components *(active)* - **Integration Tests**: End-to-end 3-node cluster testing *(currently disabled for CI/CD stability)* - **Benchmark Tests**: Performance testing for critical operations *(active)* - **Test Scripts**: Automated testing for all replication modes *(manual execution only)* @@ -480,13 +407,12 @@ Pickbox includes a comprehensive test suite covering unit tests, integration tes cd test && go test -v . 
# Run unit tests -go test -v ./pkg/storage ./cmd/pickbox +go test -v ./pkg/storage ./cmd/multi_replication ``` ### Test Scripts - `scripts/tests/test_replication.sh` - Basic Raft replication tests -- `scripts/tests/test_live_replication.sh` - Live file watching tests - `scripts/tests/test_multi_replication.sh` - Multi-directional replication tests **📖 For comprehensive testing documentation, see [`test/README.md`](test/README.md)** @@ -558,7 +484,7 @@ Pickbox uses GitHub Actions for continuous integration and deployment: ### Artifacts Published - **Coverage Reports**: HTML and raw coverage data -- **Binaries**: Cross-platform executables for the pickbox CLI +- **Binaries**: Cross-platform executables for all three modes - **Security Reports**: SARIF format security scan results - **Integration Logs**: Debug logs from failed integration tests @@ -566,3 +492,48 @@ Pickbox uses GitHub Actions for continuous integration and deployment: - **Build Status**: [![Pickbox CI/CD](https://github.com/addityasingh/pickbox/actions/workflows/go.yml/badge.svg)](https://github.com/addityasingh/pickbox/actions/workflows/go.yml) - **Code Coverage**: [![codecov](https://codecov.io/gh/addityasingh/pickbox/branch/main/graph/badge.svg)](https://codecov.io/gh/addityasingh/pickbox) - **Code Quality**: [![Go Report Card](https://goreportcard.com/badge/github.com/addityasingh/pickbox)](https://goreportcard.com/report/github.com/addityasingh/pickbox) + +## Scripts Organization + +``` +scripts/ +├── tests/ # Test scripts +│ ├── README.md +│ ├── test_replication.sh +│ └── test_multi_replication.sh +├── run_replication.sh # Demo scripts +├── run_multi_replication.sh +├── cleanup_replication.sh # Utility scripts +└── add_nodes.go +``` + +## Architecture Documentation + +Comprehensive architecture diagrams and documentation are available in `.cursor/debug/`: + +- **Step 1**: `step1_basic_raft_replication.md` - Basic Raft consensus replication + +- **Step 3**: 
`step3_multi_directional_replication.md` - Multi-directional replication +- **Overview**: `architecture_evolution_overview.md` - Complete evolution analysis + +Each document includes detailed Mermaid diagrams showing: +- Node architecture and communication patterns +- Data flow and command processing +- Component relationships and dependencies +- Evolution from basic consensus to advanced multi-directional replication + +## Improvements +- [ ] Refactor code to be more readable +- [x] Add tests for golang files +- [x] Refactor test bash scripts from scripts folder +- [x] Generate architecture diagram for each of the versions (replication, multi_replication) +- [x] Set up comprehensive CI/CD pipeline with GitHub Actions +- [x] Add comprehensive linting with pre-commit hooks and unused field detection +- [ ] Stabilize integration tests for reliable CI/CD execution (currently all disabled due to timing/resource issues) +- [ ] Deploy and create client code for this setup to test end-to-end +- [x] Make it a generalized solution for N nodes instead of hardcoded 3 nodes +- [ ] Understand the RaftFSM + +## License + +MIT License diff --git a/RELEASE.md b/RELEASE.md index ba2bce9..d3fdbdb 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -86,7 +86,7 @@ git tag v1.0.0 git push origin v1.0.0 # 5. Monitor the GitHub Actions workflow -Go to: https://github.com/addityasingh/pickbox/actions +# Go to: https://github.com/aditya/pickbox/actions ``` ## Release Notes diff --git a/cmd/pickbox/multi_replication.go b/cmd/multi_replication/main.go similarity index 63% rename from cmd/pickbox/multi_replication.go rename to cmd/multi_replication/main.go index b739908..affacd2 100644 --- a/cmd/pickbox/multi_replication.go +++ b/cmd/multi_replication/main.go @@ -1,25 +1,31 @@ +// Package main implements a multi-directional distributed file replication system. +// This version uses modular components for better maintainability and testing.
package main import ( "errors" + "flag" "fmt" + "log" "net" "os" + "os/signal" "path/filepath" "strconv" "strings" + "syscall" "time" - "github.com/addityasingh/pickbox/pkg/admin" - "github.com/addityasingh/pickbox/pkg/monitoring" - "github.com/addityasingh/pickbox/pkg/storage" - "github.com/addityasingh/pickbox/pkg/watcher" + "github.com/aditya/pickbox/pkg/admin" + "github.com/aditya/pickbox/pkg/monitoring" + "github.com/aditya/pickbox/pkg/storage" + "github.com/aditya/pickbox/pkg/watcher" "github.com/hashicorp/raft" "github.com/sirupsen/logrus" ) -// MultiConfig holds configuration for multi-directional replication -type MultiConfig struct { +// AppConfig holds all configuration for the application. +type AppConfig struct { NodeID string Port int AdminPort int @@ -31,8 +37,8 @@ type MultiConfig struct { BootstrapCluster bool } -// validateMultiConfig validates the multi-directional replication configuration. -func validateMultiConfig(cfg MultiConfig) error { +// validateConfig validates the application configuration. +func validateConfig(cfg AppConfig) error { if cfg.DataDir == "" { return errors.New("data directory cannot be empty") } @@ -48,15 +54,12 @@ func validateMultiConfig(cfg MultiConfig) error { if cfg.MonitorPort <= 0 { return errors.New("monitor port must be positive") } - if cfg.DashboardPort <= 0 { - return errors.New("dashboard port must be positive") - } return nil } -// MultiApplication represents the multi-directional replication application -type MultiApplication struct { - config MultiConfig +// Application represents the main application with all components.
+type Application struct { + config AppConfig logger *logrus.Logger raftManager *storage.RaftManager stateManager *watcher.DefaultStateManager @@ -66,10 +69,10 @@ type MultiApplication struct { dashboard *monitoring.Dashboard } -// NewMultiApplication creates a new multi-directional replication application instance -func NewMultiApplication(cfg MultiConfig) (*MultiApplication, error) { +// NewApplication creates a new application instance with all components. +func NewApplication(cfg AppConfig) (*Application, error) { // Validate configuration - if err := validateMultiConfig(cfg); err != nil { + if err := validateConfig(cfg); err != nil { return nil, fmt.Errorf("invalid configuration: %w", err) } @@ -90,7 +93,7 @@ func NewMultiApplication(cfg MultiConfig) (*MultiApplication, error) { return nil, fmt.Errorf("creating data directory: %w", err) } - app := &MultiApplication{ + app := &Application{ config: cfg, logger: logger, } @@ -104,7 +107,7 @@ func NewMultiApplication(cfg MultiConfig) (*MultiApplication, error) { } // initializeComponents sets up all application components. 
-func (app *MultiApplication) initializeComponents() error { +func (app *Application) initializeComponents() error { var err error // Initialize Raft manager @@ -122,6 +125,7 @@ func (app *MultiApplication) initializeComponents() error { app.stateManager = watcher.NewDefaultStateManager() // Access the raft instance through the manager for admin server + // (getRaftInstance wraps the RaftManager.GetRaft getter) raftInstance := app.getRaftInstance() // Initialize admin server @@ -141,7 +145,7 @@ func (app *MultiApplication) initializeComponents() error { // Initialize dashboard app.dashboard = monitoring.NewDashboard(app.monitor, app.logger) - // Initialize file watcher + // Initialize file watcher with simplified approach watcherConfig := watcher.Config{ DataDir: app.config.DataDir, NodeID: app.config.NodeID, @@ -151,9 +155,9 @@ func (app *MultiApplication) initializeComponents() error { app.fileWatcher, err = watcher.NewFileWatcher( watcherConfig, - &multiRaftWrapper{app.raftManager}, + &raftWrapper{app.raftManager}, app.stateManager, - &multiForwarderWrapper{app.logger}, + &forwarderWrapper{}, ) if err != nil { return fmt.Errorf("creating file watcher: %w", err) @@ -163,45 +167,35 @@ func (app *MultiApplication) initializeComponents() error { } // getRaftInstance provides access to the underlying raft instance -func (app *MultiApplication) getRaftInstance() *raft.Raft { +func (app *Application) getRaftInstance() *raft.Raft { if app.raftManager == nil { return nil } return app.raftManager.GetRaft() } -// multiRaftWrapper adapts RaftManager to the watcher.RaftApplier interface. -type multiRaftWrapper struct { +// raftWrapper adapts RaftManager to the watcher.RaftApplier interface.
+type raftWrapper struct { rm *storage.RaftManager } -func (rw *multiRaftWrapper) Apply(data []byte, timeout time.Duration) raft.ApplyFuture { - if rw.rm == nil { - return nil - } +func (rw *raftWrapper) Apply(data []byte, timeout time.Duration) raft.ApplyFuture { + // Apply the command directly through the Raft instance return rw.rm.GetRaft().Apply(data, timeout) } -func (rw *multiRaftWrapper) State() raft.RaftState { - if rw.rm == nil { - return raft.Shutdown - } +func (rw *raftWrapper) State() raft.RaftState { return rw.rm.State() } -func (rw *multiRaftWrapper) Leader() raft.ServerAddress { - if rw.rm == nil { - return "" - } +func (rw *raftWrapper) Leader() raft.ServerAddress { return rw.rm.Leader() } -// multiForwarderWrapper implements the watcher.LeaderForwarder interface. -type multiForwarderWrapper struct { - logger *logrus.Logger -} +// forwarderWrapper implements the watcher.LeaderForwarder interface. +type forwarderWrapper struct{} -func (fw *multiForwarderWrapper) ForwardToLeader(leaderAddr string, cmd watcher.Command) error { +func (fw *forwarderWrapper) ForwardToLeader(leaderAddr string, cmd watcher.Command) error { adminCmd := admin.Command{ Op: cmd.Op, Path: cmd.Path, @@ -210,25 +204,12 @@ func (fw *multiForwarderWrapper) ForwardToLeader(leaderAddr string, cmd watcher. NodeID: cmd.NodeID, Sequence: cmd.Sequence, } - - // Convert raft address to admin address - adminAddr := deriveMultiAdminAddress(leaderAddr) - - if fw.logger != nil { - fw.logger.WithFields(logrus.Fields{ - "leader_addr": leaderAddr, - "admin_addr": adminAddr, - "operation": cmd.Op, - "path": cmd.Path, - }).Debug("Forwarding command to leader") - } - - return admin.ForwardToLeader(adminAddr, adminCmd) + return admin.ForwardToLeader(leaderAddr, adminCmd) } // Start starts all application components. 
-func (app *MultiApplication) Start() error { - app.logger.Infof("🚀 Starting Pickbox multi-directional replication node %s", app.config.NodeID) +func (app *Application) Start() error { + app.logger.Infof("🚀 Starting Pickbox node %s", app.config.NodeID) // Start Raft cluster if err := app.startRaftCluster(); err != nil { @@ -255,14 +236,14 @@ func (app *MultiApplication) Start() error { // Wait for leadership and join cluster if needed go app.handleClusterMembership() - app.logger.Infof("✅ Multi-directional replication node %s started successfully", app.config.NodeID) + app.logger.Infof("✅ Node %s started successfully", app.config.NodeID) app.logAccessURLs() return nil } // startRaftCluster initializes the Raft cluster. -func (app *MultiApplication) startRaftCluster() error { +func (app *Application) startRaftCluster() error { if app.config.BootstrapCluster { app.logger.Info("🏗️ Bootstrapping new cluster...") @@ -287,7 +268,7 @@ func (app *MultiApplication) startRaftCluster() error { } // handleClusterMembership manages cluster joining and leadership monitoring. 
-func (app *MultiApplication) handleClusterMembership() { +func (app *Application) handleClusterMembership() { if app.config.JoinAddr != "" && !app.config.BootstrapCluster { // Wait a bit for bootstrap node to be ready time.Sleep(5 * time.Second) @@ -296,7 +277,7 @@ func (app *MultiApplication) handleClusterMembership() { app.logger.Infof("Requesting to join cluster at %s", app.config.JoinAddr) nodeAddr := fmt.Sprintf("127.0.0.1:%d", app.config.Port) - leaderAdminAddr := deriveMultiAdminAddress(app.config.JoinAddr) + leaderAdminAddr := app.deriveAdminAddress(app.config.JoinAddr) if err := app.requestJoinCluster(leaderAdminAddr, app.config.NodeID, nodeAddr); err != nil { app.logger.WithError(err).Warn("Failed to join cluster via admin interface") @@ -309,8 +290,26 @@ func (app *MultiApplication) handleClusterMembership() { go app.monitorLeadership() } +// deriveAdminAddress converts a Raft address to an admin address. +// Assumes admin port is 1000 higher than raft port. +func (app *Application) deriveAdminAddress(raftAddr string) string { + host, portStr, err := net.SplitHostPort(raftAddr) + if err != nil { + // Fallback to default admin port + return fmt.Sprintf("127.0.0.1:%d", app.config.AdminPort) + } + + port, err := strconv.Atoi(portStr) + if err != nil { + return fmt.Sprintf("127.0.0.1:%d", app.config.AdminPort) + } + + adminPort := port + 1000 // Default admin port offset + return fmt.Sprintf("%s:%d", host, adminPort) +} + // requestJoinCluster sends an ADD_VOTER command to the leader's admin interface. 
-func (app *MultiApplication) requestJoinCluster(leaderAdminAddr, nodeID, nodeAddr string) error { +func (app *Application) requestJoinCluster(leaderAdminAddr, nodeID, nodeAddr string) error { conn, err := net.DialTimeout("tcp", leaderAdminAddr, 10*time.Second) if err != nil { return fmt.Errorf("connecting to leader admin at %s: %w", leaderAdminAddr, err) @@ -338,7 +337,7 @@ func (app *MultiApplication) requestJoinCluster(leaderAdminAddr, nodeID, nodeAdd } // monitorLeadership monitors Raft leadership changes and adjusts file watching. -func (app *MultiApplication) monitorLeadership() { +func (app *Application) monitorLeadership() { ticker := time.NewTicker(5 * time.Second) defer ticker.Stop() @@ -348,7 +347,7 @@ func (app *MultiApplication) monitorLeadership() { isLeader := app.raftManager.State() == raft.Leader if isLeader && !wasLeader { - app.logger.Infof("👑 %s became leader - multi-directional file watching active", app.config.NodeID) + app.logger.Infof("👑 %s became leader - file watching active", app.config.NodeID) app.monitor.GetMetrics().IncrementFilesReplicated() // Example metric update } else if !isLeader && wasLeader { app.logger.Infof("👥 %s is now a follower", app.config.NodeID) @@ -359,7 +358,7 @@ func (app *MultiApplication) monitorLeadership() { } // logAccessURLs logs the access URLs for the various interfaces. -func (app *MultiApplication) logAccessURLs() { +func (app *Application) logAccessURLs() { app.logger.Info("🌐 Access URLs:") app.logger.Infof(" Admin Interface: http://localhost:%d", app.config.AdminPort) app.logger.Infof(" Monitoring API: http://localhost:%d", app.config.MonitorPort) @@ -370,8 +369,8 @@ func (app *MultiApplication) logAccessURLs() { } // Stop gracefully shuts down all components. 
-func (app *MultiApplication) Stop() error { - app.logger.Info("🛑 Shutting down multi-directional replication node...") +func (app *Application) Stop() error { + app.logger.Info("🛑 Shutting down Pickbox node...") // Stop file watcher if err := app.fileWatcher.Stop(); err != nil { @@ -383,69 +382,91 @@ func (app *MultiApplication) Stop() error { app.logger.WithError(err).Warn("Error stopping Raft manager") } - app.logger.Info("✅ Multi-directional replication shutdown completed") + app.logger.Info("✅ Shutdown completed") return nil } -// deriveMultiAdminAddress converts a Raft address to an admin address. -// Assumes admin port is 1000 higher than raft port. -func deriveMultiAdminAddress(raftAddr string) string { - host, portStr, err := net.SplitHostPort(raftAddr) - if err != nil { - // Fallback to localhost:9001 if parsing fails - return "127.0.0.1:9001" - } +// parseFlags parses command line flags and returns configuration. +func parseFlags() AppConfig { + var cfg AppConfig - port, err := strconv.Atoi(portStr) - if err != nil { - return "127.0.0.1:9001" + flag.StringVar(&cfg.NodeID, "node", "node1", "Node ID") + flag.IntVar(&cfg.Port, "port", 8001, "Raft port") + flag.IntVar(&cfg.AdminPort, "admin-port", 9001, "Admin server port") + flag.IntVar(&cfg.MonitorPort, "monitor-port", 6001, "Monitoring server port") + flag.IntVar(&cfg.DashboardPort, "dashboard-port", 8080, "Dashboard server port") + flag.StringVar(&cfg.JoinAddr, "join", "", "Address of node to join") + flag.StringVar(&cfg.DataDir, "data-dir", "", "Data directory (default: data/)") + flag.StringVar(&cfg.LogLevel, "log-level", "info", "Log level (debug, info, warn, error)") + flag.BoolVar(&cfg.BootstrapCluster, "bootstrap", false, "Bootstrap new cluster") + + flag.Parse() + + // Set default data directory if not provided + if cfg.DataDir == "" { + cfg.DataDir = filepath.Join("data", cfg.NodeID) } - adminPort := port + 1000 // Default admin port offset - return fmt.Sprintf("%s:%d", host, adminPort) + // Only 
bootstrap if explicitly requested + // This prevents multiple nodes from trying to bootstrap simultaneously + // The cluster manager should explicitly set -bootstrap for the first node + + return cfg } -// runMultiReplication runs the multi-directional replication with the given parameters. -func runMultiReplication(nodeID string, port int, join string, dataDir string, logger *logrus.Logger) error { - // Create configuration - cfg := MultiConfig{ - NodeID: nodeID, - Port: port, - AdminPort: port + 1000, - MonitorPort: port + 2000, - DashboardPort: port + 3000, - JoinAddr: join, - DataDir: dataDir, - LogLevel: "info", - BootstrapCluster: join == "", // Bootstrap if not joining - } +// setupSignalHandling sets up graceful shutdown on SIGINT/SIGTERM. +func setupSignalHandling(app *Application) { + c := make(chan os.Signal, 1) + signal.Notify(c, os.Interrupt, syscall.SIGTERM) + + go func() { + <-c + app.logger.Info("🔔 Received shutdown signal") + if err := app.Stop(); err != nil { + app.logger.WithError(err).Error("Error during shutdown") + os.Exit(1) + } + os.Exit(0) + }() +} + +func main() { + // Parse configuration + config := parseFlags() // Create application - app, err := NewMultiApplication(cfg) + app, err := NewApplication(config) if err != nil { - return fmt.Errorf("creating multi-directional replication application: %w", err) + log.Fatalf("Failed to create application: %v", err) } + // Setup signal handling + setupSignalHandling(app) + // Start application if err := app.Start(); err != nil { - return fmt.Errorf("starting multi-directional replication application: %w", err) + log.Fatalf("Failed to start application: %v", err) } - // Create welcome file for bootstrap node - if cfg.BootstrapCluster { + // Create a welcome file for testing (only for bootstrap node) + if config.BootstrapCluster { go func() { time.Sleep(10 * time.Second) // Wait for cluster to be ready - createMultiWelcomeFile(cfg.DataDir, cfg.NodeID, logger) + createWelcomeFile(config.DataDir, 
config.NodeID, app.logger) }() } - return nil + // Keep running + app.logger.Info("🟢 Node is running! Try editing files in the data directory.") + app.logger.Info("🛑 Press Ctrl+C to stop") + + select {} // Block forever } -// createMultiWelcomeFile creates a test file for demonstration. -func createMultiWelcomeFile(dataDir, nodeID string, logger *logrus.Logger) { +// createWelcomeFile creates a test file for demonstration. +func createWelcomeFile(dataDir, nodeID string, logger *logrus.Logger) { welcomeFile := filepath.Join(dataDir, "welcome.txt") - welcomeContent := fmt.Sprintf(`Welcome to Pickbox Multi-Directional Distributed Storage! + welcomeContent := fmt.Sprintf(`Welcome to Pickbox Distributed Storage! This file was created by %s at %s @@ -464,7 +485,7 @@ Happy distributed computing! 🎉 `, nodeID, time.Now().Format(time.RFC3339)) if err := os.WriteFile(welcomeFile, []byte(welcomeContent), 0644); err == nil { - logger.Info("📝 Created welcome.txt - try editing it to see multi-directional replication in action!") + logger.Info("📝 Created welcome.txt - try editing it to see replication in action!") } else { logger.WithError(err).Warn("Failed to create welcome file") } diff --git a/cmd/multi_replication/main_test.go b/cmd/multi_replication/main_test.go new file mode 100644 index 0000000..91e04c1 --- /dev/null +++ b/cmd/multi_replication/main_test.go @@ -0,0 +1,754 @@ +package main + +import ( + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/hashicorp/raft" + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/assert" +) + +// Test for the refactored AppConfig validation +func TestAppConfig_Validation(t *testing.T) { + tests := []struct { + name string + config AppConfig + wantErr bool + }{ + { + name: "valid config", + config: AppConfig{ + DataDir: "/tmp/test", + NodeID: "node1", + Port: 8000, + AdminPort: 9000, + MonitorPort: 8080, + LogLevel: "info", + }, + 
wantErr: false, + }, + { + name: "invalid config - empty data dir", + config: AppConfig{ + DataDir: "", + NodeID: "node1", + Port: 8000, + AdminPort: 9000, + MonitorPort: 8080, + LogLevel: "info", + }, + wantErr: true, + }, + { + name: "invalid config - empty node ID", + config: AppConfig{ + DataDir: "/tmp/test", + NodeID: "", + Port: 8000, + AdminPort: 9000, + MonitorPort: 8080, + LogLevel: "info", + }, + wantErr: true, + }, + { + name: "invalid config - zero port", + config: AppConfig{ + DataDir: "/tmp/test", + NodeID: "node1", + Port: 0, + AdminPort: 9000, + MonitorPort: 8080, + LogLevel: "info", + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateConfig(tt.config) + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + } + }) + } +} + +// Test for the refactored Application creation +func TestNewApplication(t *testing.T) { + // tempDir := t.TempDir() + + tests := []struct { + name string + config AppConfig + wantErr bool + }{ + // { + // name: "valid application creation", + // config: AppConfig{ + // DataDir: tempDir, + // NodeID: "test-node", + // Port: 8000, + // AdminPort: 9000, + // MonitorPort: 8080, + // LogLevel: "info", + // }, + // wantErr: false, + // }, + { + name: "invalid config should fail", + config: AppConfig{ + DataDir: "", + NodeID: "test-node", + Port: 8000, + AdminPort: 9000, + MonitorPort: 8080, + LogLevel: "info", + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + app, err := NewApplication(tt.config) + if tt.wantErr { + assert.Error(t, err) + assert.Nil(t, app) + } else { + assert.NoError(t, err) + assert.NotNil(t, app) + assert.Equal(t, tt.config.NodeID, app.config.NodeID) + assert.Equal(t, tt.config.DataDir, app.config.DataDir) + } + }) + } +} + +// Legacy tests for backward compatibility with original implementation +// These tests are kept for comprehensive coverage of the original functionality + 
+// Command represents a file operation with enhanced metadata for deduplication. +// This type is kept for backward compatibility with legacy tests +type Command struct { + Op string `json:"op"` // Operation type: "write" or "delete" + Path string `json:"path"` // Relative file path + Data []byte `json:"data"` // File content (for write operations) + Hash string `json:"hash"` // SHA-256 content hash for deduplication + NodeID string `json:"node_id"` // Originating node ID + Sequence int64 `json:"sequence"` // Sequence number for ordering +} + +const ( + // Operations + opWrite = "write" + opDelete = "delete" +) + +func TestCommand_JSONSerialization(t *testing.T) { + tests := []struct { + name string + cmd Command + }{ + { + name: "write_command", + cmd: Command{ + Op: opWrite, + Path: "test/file.txt", + Data: []byte("test content"), + Hash: "abc123", + NodeID: "node1", + Sequence: 1, + }, + }, + { + name: "delete_command", + cmd: Command{ + Op: opDelete, + Path: "test/file.txt", + NodeID: "node2", + Sequence: 2, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Test marshaling + data, err := json.Marshal(tt.cmd) + assert.NoError(t, err) + assert.NotEmpty(t, data) + + // Test unmarshaling + var unmarshaled Command + err = json.Unmarshal(data, &unmarshaled) + assert.NoError(t, err) + assert.Equal(t, tt.cmd.Op, unmarshaled.Op) + assert.Equal(t, tt.cmd.Path, unmarshaled.Path) + assert.Equal(t, tt.cmd.Data, unmarshaled.Data) + assert.Equal(t, tt.cmd.Hash, unmarshaled.Hash) + assert.Equal(t, tt.cmd.NodeID, unmarshaled.NodeID) + assert.Equal(t, tt.cmd.Sequence, unmarshaled.Sequence) + }) + } +} + +// hashContent computes the SHA-256 hash of data for backward compatibility +func hashContent(data []byte) string { + hash := sha256.Sum256(data) + return hex.EncodeToString(hash[:]) +} + +func TestHashContent(t *testing.T) { + tests := []struct { + name string + data []byte + expected string + }{ + { + name: "empty_data", + data: []byte{}, 
+ expected: "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + }, + { + name: "hello_world", + data: []byte("hello world"), + expected: "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9", + }, + { + name: "test_content", + data: []byte("test content for hashing"), + expected: computeExpectedHash([]byte("test content for hashing")), + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := hashContent(tt.data) + assert.Equal(t, tt.expected, result) + assert.Len(t, result, 64) // SHA-256 produces 64-character hex string + }) + } +} + +// Legacy FSM tests for backward compatibility +// FileState tracks file metadata to prevent unnecessary operations. +type FileState struct { + Hash string + LastModified time.Time + Size int64 +} + +// ReplicationFSM implements the Raft finite state machine with conflict resolution. +// This is kept for backward compatibility with legacy tests +type ReplicationFSM struct { + dataDir string + nodeID string + fileStates map[string]*FileState + watchingPaused bool + lastSequence int64 + logger *logrus.Logger +} + +func NewReplicationFSM(dataDir, nodeID string, logger *logrus.Logger) *ReplicationFSM { + return &ReplicationFSM{ + dataDir: dataDir, + nodeID: nodeID, + fileStates: make(map[string]*FileState), + logger: logger, + } +} + +func (fsm *ReplicationFSM) getNextSequence() int64 { + fsm.lastSequence++ + return fsm.lastSequence +} + +func (fsm *ReplicationFSM) isWatchingPaused() bool { + return fsm.watchingPaused +} + +func (fsm *ReplicationFSM) pauseWatching() { + fsm.watchingPaused = true +} + +func (fsm *ReplicationFSM) resumeWatching() { + fsm.watchingPaused = false +} + +func (fsm *ReplicationFSM) fileHasContent(path string, expectedData []byte) bool { + state, exists := fsm.fileStates[path] + if !exists { + return false + } + expectedHash := hashContent(expectedData) + return state.Hash == expectedHash +} + +func (fsm *ReplicationFSM) updateFileState(path string, 
data []byte) { + fsm.fileStates[path] = &FileState{ + Hash: hashContent(data), + LastModified: time.Now(), + Size: int64(len(data)), + } +} + +func (fsm *ReplicationFSM) removeFileState(path string) { + delete(fsm.fileStates, path) +} + +func (fsm *ReplicationFSM) Apply(log *raft.Log) interface{} { + var cmd Command + if err := json.Unmarshal(log.Data, &cmd); err != nil { + return fmt.Errorf("unmarshaling command: %w", err) + } + + // Skip if this command originated from the current node and content matches + if cmd.NodeID == fsm.nodeID && fsm.fileHasContent(cmd.Path, cmd.Data) { + return nil // Avoid infinite loops + } + + // Temporarily disable file watching during application + fsm.pauseWatching() + defer fsm.resumeWatching() + + switch cmd.Op { + case opWrite: + return fsm.applyWrite(cmd) + case opDelete: + return fsm.applyDelete(cmd) + default: + return fmt.Errorf("unknown operation: %q", cmd.Op) + } +} + +func (fsm *ReplicationFSM) applyWrite(cmd Command) error { + filePath := filepath.Join(fsm.dataDir, cmd.Path) + + // Check if content already matches to avoid unnecessary writes + if fsm.fileHasContent(cmd.Path, cmd.Data) { + return nil + } + + if err := os.MkdirAll(filepath.Dir(filePath), 0755); err != nil { + return fmt.Errorf("creating directory for %q: %w", cmd.Path, err) + } + + if err := os.WriteFile(filePath, cmd.Data, 0644); err != nil { + return fmt.Errorf("writing file %q: %w", cmd.Path, err) + } + + fsm.updateFileState(cmd.Path, cmd.Data) + return nil +} + +func (fsm *ReplicationFSM) applyDelete(cmd Command) error { + filePath := filepath.Join(fsm.dataDir, cmd.Path) + + if err := os.Remove(filePath); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("deleting file %q: %w", cmd.Path, err) + } + + fsm.removeFileState(cmd.Path) + return nil +} + +func (fsm *ReplicationFSM) Snapshot() (raft.FSMSnapshot, error) { + return &Snapshot{dataDir: fsm.dataDir}, nil +} + +func (fsm *ReplicationFSM) Restore(rc io.ReadCloser) error { + defer rc.Close() + 
return nil +} + +// Snapshot implements raft.FSMSnapshot for state persistence. +type Snapshot struct { + dataDir string +} + +func (s *Snapshot) Persist(sink raft.SnapshotSink) error { + defer sink.Close() + + if _, err := sink.Write([]byte("snapshot")); err != nil { + sink.Cancel() + return fmt.Errorf("writing snapshot: %w", err) + } + + return nil +} + +func (s *Snapshot) Release() { + // No resources to clean up +} + +func TestReplicationFSM_NewReplicationFSM(t *testing.T) { + dataDir := "/tmp/test-fsm" + nodeID := "test-node" + logger := logrus.New() + + fsm := NewReplicationFSM(dataDir, nodeID, logger) + + assert.NotNil(t, fsm) + assert.Equal(t, dataDir, fsm.dataDir) + assert.Equal(t, nodeID, fsm.nodeID) + assert.NotNil(t, fsm.fileStates) + assert.Equal(t, logger, fsm.logger) + assert.False(t, fsm.watchingPaused) + assert.Equal(t, int64(0), fsm.lastSequence) +} + +func TestReplicationFSM_SequenceGeneration(t *testing.T) { + fsm := NewReplicationFSM("/tmp/test", "test-node", logrus.New()) + + // Test sequence generation + seq1 := fsm.getNextSequence() + seq2 := fsm.getNextSequence() + seq3 := fsm.getNextSequence() + + assert.Equal(t, int64(1), seq1) + assert.Equal(t, int64(2), seq2) + assert.Equal(t, int64(3), seq3) +} + +func TestReplicationFSM_WatchingControls(t *testing.T) { + fsm := NewReplicationFSM("/tmp/test", "test-node", logrus.New()) + + // Initial state + assert.False(t, fsm.isWatchingPaused()) + + // Pause watching + fsm.pauseWatching() + assert.True(t, fsm.isWatchingPaused()) + + // Resume watching + fsm.resumeWatching() + assert.False(t, fsm.isWatchingPaused()) +} + +func TestReplicationFSM_FileStateManagement(t *testing.T) { + fsm := NewReplicationFSM("/tmp/test", "test-node", logrus.New()) + + path := "test/file.txt" + data := []byte("test content") + + // Initially no file state + assert.False(t, fsm.fileHasContent(path, data)) + + // Update file state + fsm.updateFileState(path, data) + + // Should now have content + assert.True(t, 
fsm.fileHasContent(path, data)) + + // Different content should return false + differentData := []byte("different content") + assert.False(t, fsm.fileHasContent(path, differentData)) + + // Remove file state + fsm.removeFileState(path) + assert.False(t, fsm.fileHasContent(path, data)) +} + +func TestReplicationFSM_Apply_WriteCommand(t *testing.T) { + // Note: This test would require a real filesystem, so we test the logic parts + fsm := NewReplicationFSM("/tmp/test-apply", "test-node", logrus.New()) + + cmd := Command{ + Op: opWrite, + Path: "test.txt", + Data: []byte("test content"), + Hash: hashContent([]byte("test content")), + NodeID: "other-node", // Different node to avoid skip logic + Sequence: 1, + } + + // Create a mock log entry + cmdData, err := json.Marshal(cmd) + assert.NoError(t, err) + + log := &raft.Log{ + Data: cmdData, + } + + // This would fail in real test due to filesystem access, but tests the unmarshaling + result := fsm.Apply(log) + + // Check if it's an error related to filesystem (expected in test environment) + if result != nil { + err, ok := result.(error) + if ok { + // In test environment, expect filesystem-related errors + assert.Contains(t, err.Error(), "creating directory") + } + } +} + +func TestReplicationFSM_Apply_InvalidJSON(t *testing.T) { + fsm := NewReplicationFSM("/tmp/test", "test-node", logrus.New()) + + log := &raft.Log{ + Data: []byte("invalid json"), + } + + result := fsm.Apply(log) + assert.NotNil(t, result) + + err, ok := result.(error) + assert.True(t, ok) + assert.Contains(t, err.Error(), "unmarshaling command") +} + +func TestReplicationFSM_Apply_UnknownOperation(t *testing.T) { + fsm := NewReplicationFSM("/tmp/test", "test-node", logrus.New()) + + cmd := Command{ + Op: "unknown", + Path: "test.txt", + NodeID: "other-node", + } + + cmdData, err := json.Marshal(cmd) + assert.NoError(t, err) + + log := &raft.Log{ + Data: cmdData, + } + + result := fsm.Apply(log) + assert.NotNil(t, result) + + err, ok := 
result.(error) + assert.True(t, ok) + assert.Contains(t, err.Error(), "unknown operation") +} + +func TestReplicationFSM_Apply_SkipSameNode(t *testing.T) { + fsm := NewReplicationFSM("/tmp/test", "test-node", logrus.New()) + + // Pre-populate file state to trigger skip logic + path := "test.txt" + data := []byte("test content") + fsm.updateFileState(path, data) + + cmd := Command{ + Op: opWrite, + Path: path, + Data: data, + NodeID: "test-node", // Same as FSM's nodeID + } + + cmdData, err := json.Marshal(cmd) + assert.NoError(t, err) + + log := &raft.Log{ + Data: cmdData, + } + + result := fsm.Apply(log) + assert.Nil(t, result) // Should be nil when skipped +} + +func TestReplicationFSM_Snapshot(t *testing.T) { + fsm := NewReplicationFSM("/tmp/test-snapshot", "test-node", logrus.New()) + + snapshot, err := fsm.Snapshot() + assert.NoError(t, err) + assert.NotNil(t, snapshot) + + // Verify snapshot type + s, ok := snapshot.(*Snapshot) + assert.True(t, ok) + assert.Equal(t, fsm.dataDir, s.dataDir) +} + +func TestSnapshot_Persist(t *testing.T) { + snapshot := &Snapshot{dataDir: "/tmp/test"} + + // Create a mock sink + mockSink := &mockSnapshotSink{} + + err := snapshot.Persist(mockSink) + assert.NoError(t, err) + assert.True(t, mockSink.closed) + assert.Equal(t, []byte("snapshot"), mockSink.data) +} + +func TestSnapshot_Release(t *testing.T) { + snapshot := &Snapshot{dataDir: "/tmp/test"} + + // Should not panic + assert.NotPanics(t, func() { + snapshot.Release() + }) +} + +// isRaftFile checks if a file is related to Raft internals. 
+func isRaftFile(filename string) bool { + base := filepath.Base(filename) + return strings.HasPrefix(base, "raft-") || + strings.HasSuffix(base, ".db") || + base == "snapshots" || + strings.Contains(filename, "snapshots") +} + +func TestIsRaftFile(t *testing.T) { + tests := []struct { + filename string + expected bool + }{ + {"raft-log.db", true}, + {"raft-stable.db", true}, + {"snapshots", true}, + {"data/node1/snapshots/1-2-123456.tmp", true}, + {"regular-file.txt", false}, + {"data.db", true}, // ends with .db + {"normal.txt", false}, + {"prefix-raft-something", false}, // doesn't start with "raft-" + } + + for _, tt := range tests { + t.Run(tt.filename, func(t *testing.T) { + result := isRaftFile(tt.filename) + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestParseFlags(t *testing.T) { + // Test the logic that parseFlags implements + cfg := AppConfig{ + NodeID: "test-node", + Port: 8001, + JoinAddr: "", + } + + // Derive other fields as parseFlags would + if cfg.DataDir == "" { + cfg.DataDir = filepath.Join("data", cfg.NodeID) + } + cfg.AdminPort = 9001 + cfg.MonitorPort = 6001 + cfg.DashboardPort = 8080 + cfg.BootstrapCluster = cfg.JoinAddr == "" + + assert.Equal(t, "data/test-node", cfg.DataDir) + assert.Equal(t, 9001, cfg.AdminPort) + assert.Equal(t, 6001, cfg.MonitorPort) + assert.Equal(t, 8080, cfg.DashboardPort) + assert.True(t, cfg.BootstrapCluster) + + // Test with join address + cfg.JoinAddr = "127.0.0.1:8002" + cfg.BootstrapCluster = cfg.JoinAddr == "" + assert.False(t, cfg.BootstrapCluster) +} + +// Helper functions and mocks + +func computeExpectedHash(data []byte) string { + hash := sha256.Sum256(data) + return hex.EncodeToString(hash[:]) +} + +type mockSnapshotSink struct { + data []byte + closed bool +} + +func (m *mockSnapshotSink) Write(p []byte) (n int, err error) { + m.data = append(m.data, p...) 
+ return len(p), nil +} + +func (m *mockSnapshotSink) Close() error { + m.closed = true + return nil +} + +func (m *mockSnapshotSink) ID() string { + return "mock-snapshot" +} + +func (m *mockSnapshotSink) Cancel() error { + return nil +} + +// Benchmark tests for performance verification + +func BenchmarkHashContent(b *testing.B) { + data := []byte("benchmark data for hashing performance testing with longer content") + + b.ResetTimer() + for i := 0; i < b.N; i++ { + hashContent(data) + } +} + +func BenchmarkReplicationFSM_FileStateManagement(b *testing.B) { + fsm := NewReplicationFSM("/tmp/bench", "bench-node", logrus.New()) + data := []byte("benchmark data") + + b.ResetTimer() + for i := 0; i < b.N; i++ { + path := "bench-file.txt" + fsm.updateFileState(path, data) + fsm.fileHasContent(path, data) + } +} + +func BenchmarkReplicationFSM_SequenceGeneration(b *testing.B) { + fsm := NewReplicationFSM("/tmp/bench", "bench-node", logrus.New()) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + fsm.getNextSequence() + } +} + +func BenchmarkCommand_JSONMarshal(b *testing.B) { + cmd := Command{ + Op: opWrite, + Path: "benchmark/file.txt", + Data: []byte("benchmark data for JSON marshaling performance"), + Hash: hashContent([]byte("benchmark data")), + NodeID: "bench-node", + Sequence: 1, + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + json.Marshal(cmd) + } +} + +func BenchmarkCommand_JSONUnmarshal(b *testing.B) { + cmd := Command{ + Op: opWrite, + Path: "benchmark/file.txt", + Data: []byte("benchmark data for JSON unmarshaling performance"), + Hash: hashContent([]byte("benchmark data")), + NodeID: "bench-node", + Sequence: 1, + } + + data, _ := json.Marshal(cmd) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + var unmarshaled Command + json.Unmarshal(data, &unmarshaled) + } +} diff --git a/cmd/pickbox/cluster.go b/cmd/pickbox/cluster.go deleted file mode 100644 index cbfce9a..0000000 --- a/cmd/pickbox/cluster.go +++ /dev/null @@ -1,168 +0,0 @@ -package main - 
-import ( - "fmt" - "net" - "strings" - "sync" - "time" - - "github.com/spf13/cobra" -) - -var clusterCmd = &cobra.Command{ - Use: "cluster", - Short: "Cluster management commands", - Long: `Commands for managing Pickbox clusters including joining nodes and cluster operations`, -} - -var clusterJoinCmd = &cobra.Command{ - Use: "join", - Short: "Join a node to an existing cluster", - Long: `Join a node to an existing Pickbox cluster by specifying the leader address`, - RunE: runClusterJoin, -} - -var clusterStatusCmd = &cobra.Command{ - Use: "status", - Short: "Check cluster status", - Long: `Check the status of a Pickbox cluster`, - RunE: runClusterStatus, -} - -// Mutex to protect global variables from concurrent access -var globalVarsMutex sync.RWMutex - -// Cluster join command flags -var ( - leaderAddr string - joinNodeID string - joinNodeAddr string -) - -// Cluster status command flags -var ( - statusAddr string -) - -func init() { - rootCmd.AddCommand(clusterCmd) - clusterCmd.AddCommand(clusterJoinCmd) - clusterCmd.AddCommand(clusterStatusCmd) - - // Cluster join command flags - clusterJoinCmd.Flags().StringVarP(&leaderAddr, "leader", "L", "", "Leader address (required)") - clusterJoinCmd.Flags().StringVarP(&joinNodeID, "node-id", "n", "", "Node ID to join (required)") - clusterJoinCmd.Flags().StringVarP(&joinNodeAddr, "node-addr", "a", "", "Node address (required)") - clusterJoinCmd.MarkFlagRequired("leader") - clusterJoinCmd.MarkFlagRequired("node-id") - clusterJoinCmd.MarkFlagRequired("node-addr") - - // Cluster status command flags - clusterStatusCmd.Flags().StringVarP(&statusAddr, "addr", "a", "127.0.0.1:9001", "Admin address to check status") -} - -func runClusterJoin(cmd *cobra.Command, args []string) error { - // Validate cmd parameter - if cmd == nil { - return fmt.Errorf("command is nil") - } - - // Thread-safe access to global variables - globalVarsMutex.RLock() - leader := leaderAddr - nodeID := joinNodeID - nodeAddr := joinNodeAddr - 
globalVarsMutex.RUnlock() - - // Validate required global variables are set - if leader == "" { - return fmt.Errorf("leader address is required") - } - if nodeID == "" { - return fmt.Errorf("node ID is required") - } - if nodeAddr == "" { - return fmt.Errorf("node address is required") - } - - // Derive admin address from leader address - adminAddr := deriveAdminAddr(leader) - - fmt.Printf("Attempting to join node %s (%s) to cluster via %s...\n", nodeID, nodeAddr, adminAddr) - - // Use the admin API to join the cluster - conn, err := net.DialTimeout("tcp", adminAddr, 5*time.Second) - if err != nil { - return fmt.Errorf("connecting to admin server: %w", err) - } - defer conn.Close() - - message := fmt.Sprintf("ADD_VOTER %s %s", nodeID, nodeAddr) - if _, err := conn.Write([]byte(message)); err != nil { - return fmt.Errorf("sending join request: %w", err) - } - - // Read response - buffer := make([]byte, 1024) - n, err := conn.Read(buffer) - if err != nil { - return fmt.Errorf("reading response: %w", err) - } - - response := strings.TrimSpace(string(buffer[:n])) - if response != "OK" { - return fmt.Errorf("join request failed: %s", response) - } - - fmt.Printf("✅ Successfully joined node %s to cluster\n", nodeID) - return nil -} - -func runClusterStatus(cmd *cobra.Command, args []string) error { - // Validate cmd parameter - if cmd == nil { - return fmt.Errorf("command is nil") - } - - // Thread-safe access to global variables - globalVarsMutex.RLock() - statusAddress := statusAddr - globalVarsMutex.RUnlock() - - // Validate required global variable is set - if statusAddress == "" { - return fmt.Errorf("status address is required") - } - - // This is a simple implementation - in a real system you'd query more cluster info - conn, err := net.DialTimeout("tcp", statusAddress, 2*time.Second) - if err != nil { - fmt.Printf("❌ Cannot connect to admin server at %s\n", statusAddress) - return fmt.Errorf("connecting to admin server: %w", err) - } - defer conn.Close() - - 
fmt.Printf("✅ Admin server is reachable at %s\n", statusAddress) - fmt.Printf("🔍 For detailed cluster status, check the monitoring dashboard\n") - return nil -} - -func deriveAdminAddr(raftAddr string) string { - // Handle empty or invalid input - if raftAddr == "" { - return "127.0.0.1:9001" // Default admin port - } - - parts := strings.Split(raftAddr, ":") - if len(parts) != 2 || parts[0] == "" { - return "127.0.0.1:9001" // Default admin port - } - - // Convert raft port to admin port (typically raft_port + 1000) - host := strings.TrimSpace(parts[0]) - if host == "" { - host = "127.0.0.1" - } - return fmt.Sprintf("%s:9001", host) // Default admin port -} diff --git a/cmd/pickbox/cluster_test.go b/cmd/pickbox/cluster_test.go deleted file mode 100644 index 276c6ff..0000000 --- a/cmd/pickbox/cluster_test.go +++ /dev/null @@ -1,619 +0,0 @@ -package main - -import ( - "net" - "strings" - "testing" - "time" - - "github.com/spf13/cobra" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestClusterCommand(t *testing.T) { - tests := []struct { - name string - expectedUse string - expectedShort string - }{ - { - name: "cluster command properties", - expectedUse: "cluster", - expectedShort: "Cluster management commands", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expectedUse, clusterCmd.Use) - assert.Equal(t, tt.expectedShort, clusterCmd.Short) - assert.NotEmpty(t, clusterCmd.Long) - }) - } -} - -func TestClusterJoinCommand(t *testing.T) { - tests := []struct { - name string - expectedUse string - expectedShort string - }{ - { - name: "cluster join command properties", - expectedUse: "join", - expectedShort: "Join a node to an existing cluster", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expectedUse, clusterJoinCmd.Use) - assert.Equal(t, tt.expectedShort, clusterJoinCmd.Short) - assert.NotEmpty(t, clusterJoinCmd.Long) - 
assert.NotNil(t, clusterJoinCmd.RunE) - }) - } -} - -func TestClusterStatusCommand(t *testing.T) { - tests := []struct { - name string - expectedUse string - expectedShort string - }{ - { - name: "cluster status command properties", - expectedUse: "status", - expectedShort: "Check cluster status", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expectedUse, clusterStatusCmd.Use) - assert.Equal(t, tt.expectedShort, clusterStatusCmd.Short) - assert.NotEmpty(t, clusterStatusCmd.Long) - assert.NotNil(t, clusterStatusCmd.RunE) - }) - } -} - -func TestClusterCommandInitialization(t *testing.T) { - // Test that cluster command is properly added to root - found := false - for _, cmd := range rootCmd.Commands() { - if cmd.Use == "cluster" { - found = true - break - } - } - assert.True(t, found, "cluster command should be added to root command") - - // Test that subcommands are properly added to cluster command - subcommands := clusterCmd.Commands() - expectedSubcommands := []string{"join", "status"} - - for _, expected := range expectedSubcommands { - found := false - for _, cmd := range subcommands { - if cmd.Use == expected { - found = true - break - } - } - assert.True(t, found, "subcommand %s should be added to cluster command", expected) - } -} - -func TestClusterJoinCommandFlags(t *testing.T) { - tests := []struct { - name string - flagName string - shortFlag string - usage string - required bool - }{ - { - name: "leader flag", - flagName: "leader", - shortFlag: "L", // Changed to avoid conflict with log-level flag - usage: "Leader address (required)", - required: true, - }, - { - name: "node-id flag", - flagName: "node-id", - shortFlag: "n", - usage: "Node ID to join (required)", - required: true, - }, - { - name: "node-addr flag", - flagName: "node-addr", - shortFlag: "a", - usage: "Node address (required)", - required: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - flag := 
clusterJoinCmd.Flags().Lookup(tt.flagName) - require.NotNil(t, flag, "Flag %s should exist", tt.flagName) - - assert.Equal(t, tt.shortFlag, flag.Shorthand, "Short flag mismatch for %s", tt.flagName) - assert.Contains(t, flag.Usage, tt.usage, "Usage description mismatch for %s", tt.flagName) - }) - } -} - -func TestClusterStatusCommandFlags(t *testing.T) { - tests := []struct { - name string - flagName string - shortFlag string - defaultValue string - usage string - }{ - { - name: "addr flag", - flagName: "addr", - shortFlag: "a", - defaultValue: "127.0.0.1:9001", - usage: "Admin address to check status", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - flag := clusterStatusCmd.Flags().Lookup(tt.flagName) - require.NotNil(t, flag, "Flag %s should exist", tt.flagName) - - assert.Equal(t, tt.defaultValue, flag.DefValue, "Default value mismatch for %s", tt.flagName) - assert.Equal(t, tt.shortFlag, flag.Shorthand, "Short flag mismatch for %s", tt.flagName) - assert.Contains(t, flag.Usage, tt.usage, "Usage description mismatch for %s", tt.flagName) - }) - } -} - -func TestRunClusterJoinWithoutServer(t *testing.T) { - if testing.Short() { - t.Skip("Skipping test in short mode") - } - - // Test cluster join when no server is running - // Use t.Parallel() to ensure proper test isolation - t.Parallel() - - // Thread-safe access to save original global variables - globalVarsMutex.Lock() - originalLeaderAddr := leaderAddr - originalJoinNodeID := joinNodeID - originalJoinNodeAddr := joinNodeAddr - - // Set the global variables for this test with unique ports to avoid conflicts - leaderAddr = "127.0.0.1:18001" - joinNodeID = "test-node-join" - joinNodeAddr = "127.0.0.1:18002" - globalVarsMutex.Unlock() - - // Ensure cleanup even if test panics - defer func() { - if r := recover(); r != nil { - t.Errorf("Test panicked: %v", r) - } - globalVarsMutex.Lock() - leaderAddr = originalLeaderAddr - joinNodeID = originalJoinNodeID - joinNodeAddr = 
originalJoinNodeAddr - globalVarsMutex.Unlock() - }() - - // Validate that variables are properly set before calling function - globalVarsMutex.RLock() - currentLeaderAddr := leaderAddr - currentJoinNodeID := joinNodeID - currentJoinNodeAddr := joinNodeAddr - globalVarsMutex.RUnlock() - - assert.NotEmpty(t, currentLeaderAddr, "leaderAddr should not be empty") - assert.NotEmpty(t, currentJoinNodeID, "joinNodeID should not be empty") - assert.NotEmpty(t, currentJoinNodeAddr, "joinNodeAddr should not be empty") - - cmd := &cobra.Command{Use: "test"} - assert.NotNil(t, cmd, "cmd should not be nil") - - err := runClusterJoin(cmd, []string{}) - - assert.Contains(t, err.Error(), "connecting to admin server") -} - -func TestRunClusterStatusWithoutServer(t *testing.T) { - if testing.Short() { - t.Skip("Skipping test in short mode") - } - - // Test cluster status when no server is running - // Use t.Parallel() to ensure proper test isolation - t.Parallel() - - // Thread-safe access to save original global variable - globalVarsMutex.Lock() - originalStatusAddr := statusAddr - - statusAddr = "127.0.0.1:19999" // Use a unique unused port - globalVarsMutex.Unlock() - - // Ensure cleanup even if test panics - defer func() { - if r := recover(); r != nil { - t.Errorf("Test panicked: %v", r) - } - globalVarsMutex.Lock() - statusAddr = originalStatusAddr - globalVarsMutex.Unlock() - }() - - // Validate that variable is properly set before calling function - globalVarsMutex.RLock() - currentStatusAddr := statusAddr - globalVarsMutex.RUnlock() - - assert.NotEmpty(t, currentStatusAddr, "statusAddr should not be empty") - - cmd := &cobra.Command{Use: "test"} - assert.NotNil(t, cmd, "cmd should not be nil") - - err := runClusterStatus(cmd, []string{}) - - assert.Contains(t, err.Error(), "connecting to admin server") -} - -func TestDeriveAdminAddr(t *testing.T) { - tests := []struct { - name string - raftAddr string - expected string - }{ - { - name: "valid raft address", - raftAddr: 
"127.0.0.1:8001", - expected: "127.0.0.1:9001", - }, - { - name: "localhost raft address", - raftAddr: "localhost:8001", - expected: "localhost:9001", - }, - { - name: "invalid raft address", - raftAddr: "invalid-address", - expected: "127.0.0.1:9001", // Default - }, - { - name: "empty raft address", - raftAddr: "", - expected: "127.0.0.1:9001", // Default - }, - { - name: "raft address without port", - raftAddr: "127.0.0.1", - expected: "127.0.0.1:9001", // Default - }, - { - name: "raft address with multiple colons", - raftAddr: "127.0.0.1:8001:extra", - expected: "127.0.0.1:9001", // Default (invalid format) - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := deriveAdminAddr(tt.raftAddr) - assert.Equal(t, tt.expected, result) - }) - } -} - -func TestClusterJoinRequiredFlags(t *testing.T) { - // Test that required flags are properly marked by checking flag annotations - requiredFlags := []string{"leader", "node-id", "node-addr"} - - for _, flagName := range requiredFlags { - t.Run("required_flag_"+flagName, func(t *testing.T) { - flag := clusterJoinCmd.Flags().Lookup(flagName) - require.NotNil(t, flag, "Flag %s should exist", flagName) - - // Check if the flag is marked as required by trying to validate without it - // We create a fresh command to avoid global state interference - testCmd := &cobra.Command{Use: "test"} - testCmd.Flags().StringP(flagName, flag.Shorthand, "", flag.Usage) - testCmd.MarkFlagRequired(flagName) - - // This should fail when the required flag is missing - err := testCmd.ParseFlags([]string{}) - assert.NoError(t, err, "Parsing flags should not error") - - // The validation happens during Execute, but we can't test that easily - // So we just verify the flag exists and has the right properties - assert.NotEmpty(t, flag.Usage, "Flag should have usage text") - }) - } -} - -func TestClusterCommandUsage(t *testing.T) { - // Test cluster command usage - usage := clusterCmd.UsageString() - 
assert.Contains(t, usage, "cluster") - assert.Contains(t, usage, "Available Commands") -} - -func TestClusterJoinCommandUsage(t *testing.T) { - // Test cluster join command usage - just check that it has the basic structure - assert.NotEmpty(t, clusterJoinCmd.Use) - assert.NotEmpty(t, clusterJoinCmd.Short) - assert.NotEmpty(t, clusterJoinCmd.Long) -} - -func TestClusterStatusCommandUsage(t *testing.T) { - // Test cluster status command usage - usage := clusterStatusCmd.UsageString() - assert.Contains(t, usage, "status") -} - -func TestClusterCommandHelp(t *testing.T) { - // Test that help doesn't panic - assert.NotPanics(t, func() { - clusterCmd.SetArgs([]string{"--help"}) - clusterCmd.Execute() - }) -} - -func TestClusterJoinCommandHelp(t *testing.T) { - // Test that help doesn't panic - assert.NotPanics(t, func() { - clusterJoinCmd.SetArgs([]string{"--help"}) - clusterJoinCmd.Execute() - }) -} - -func TestClusterStatusCommandHelp(t *testing.T) { - // Test that help doesn't panic - assert.NotPanics(t, func() { - clusterStatusCmd.SetArgs([]string{"--help"}) - clusterStatusCmd.Execute() - }) -} - -func TestClusterJoinCommandValidation(t *testing.T) { - // Test that the required flags are properly configured - requiredFlags := []string{"leader", "node-id", "node-addr"} - - for _, flagName := range requiredFlags { - t.Run("flag_"+flagName+"_exists", func(t *testing.T) { - flag := clusterJoinCmd.Flags().Lookup(flagName) - assert.NotNil(t, flag, "Flag %s should exist", flagName) - assert.NotEmpty(t, flag.Usage, "Flag should have usage text") - }) - } -} - -func TestClusterStatusCommandValidation(t *testing.T) { - tests := []struct { - name string - addr string - wantErr bool - }{ - { - name: "default address", - addr: "", - wantErr: false, // Will fail at connection, not validation - }, - { - name: "custom address", - addr: "127.0.0.1:9002", - wantErr: false, // Will fail at connection, not validation - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t 
*testing.T) { - args := []string{} - if tt.addr != "" { - args = append(args, "--addr", tt.addr) - } - - clusterStatusCmd.SetArgs(args) - err := clusterStatusCmd.Execute() - - // Should fail at connection, not validation - if err != nil { - assert.Contains(t, err.Error(), "connecting to admin server") - } - }) - } -} - -func TestClusterCommandStructure(t *testing.T) { - // Test command structure - assert.NotEmpty(t, clusterCmd.Use) - assert.NotEmpty(t, clusterCmd.Short) - assert.NotEmpty(t, clusterCmd.Long) - - // Test subcommands structure - assert.NotEmpty(t, clusterJoinCmd.Use) - assert.NotEmpty(t, clusterJoinCmd.Short) - assert.NotEmpty(t, clusterJoinCmd.Long) - assert.NotNil(t, clusterJoinCmd.RunE) - - assert.NotEmpty(t, clusterStatusCmd.Use) - assert.NotEmpty(t, clusterStatusCmd.Short) - assert.NotEmpty(t, clusterStatusCmd.Long) - assert.NotNil(t, clusterStatusCmd.RunE) -} - -func TestGlobalVariables(t *testing.T) { - // Test that global variables exist and can be set - // Thread-safe access to save original global variables - globalVarsMutex.Lock() - originalLeader := leaderAddr - originalNodeID := joinNodeID - originalNodeAddr := joinNodeAddr - originalStatusAddr := statusAddr - - // Test setting variables - leaderAddr = "test-leader" - joinNodeID = "test-node-id" - joinNodeAddr = "test-node-addr" - statusAddr = "test-status-addr" - globalVarsMutex.Unlock() - - defer func() { - globalVarsMutex.Lock() - leaderAddr = originalLeader - joinNodeID = originalNodeID - joinNodeAddr = originalNodeAddr - statusAddr = originalStatusAddr - globalVarsMutex.Unlock() - }() - - // Verify variables are set correctly - globalVarsMutex.RLock() - assert.Equal(t, "test-leader", leaderAddr) - assert.Equal(t, "test-node-id", joinNodeID) - assert.Equal(t, "test-node-addr", joinNodeAddr) - assert.Equal(t, "test-status-addr", statusAddr) - globalVarsMutex.RUnlock() -} - -func TestClusterJoinWithValidFlags(t *testing.T) { - if testing.Short() { - t.Skip("Skipping test in short mode") 
- } - - // Test cluster join by verifying the function logic directly - // Use t.Parallel() to ensure proper test isolation - t.Parallel() - - // Thread-safe access to save original global variables - globalVarsMutex.Lock() - originalLeader := leaderAddr - originalNodeID := joinNodeID - originalNodeAddr := joinNodeAddr - - // Set the global variables (simulating flag parsing) with unique ports - leaderAddr = "127.0.0.1:28001" - joinNodeID = "test-node-valid" - joinNodeAddr = "127.0.0.1:28002" - globalVarsMutex.Unlock() - - defer func() { - if r := recover(); r != nil { - t.Errorf("Test panicked: %v", r) - } - globalVarsMutex.Lock() - leaderAddr = originalLeader - joinNodeID = originalNodeID - joinNodeAddr = originalNodeAddr - globalVarsMutex.Unlock() - }() - - cmd := &cobra.Command{Use: "test"} - assert.NotNil(t, cmd, "cmd should not be nil") - - err := runClusterJoin(cmd, []string{}) - - // Should fail at connection attempt, not flag validation - assert.Error(t, err, "should error when cannot connect to admin server") - assert.Contains(t, err.Error(), "connecting to admin server") -} - -func TestNetworkTimeout(t *testing.T) { - // Test that network operations timeout appropriately - start := time.Now() - - // This should timeout quickly since the address doesn't exist - _, err := net.DialTimeout("tcp", "192.0.2.1:9999", 100*time.Millisecond) - - elapsed := time.Since(start) - assert.Error(t, err) - assert.True(t, elapsed < 200*time.Millisecond, "Should timeout within reasonable time") -} - -func TestFlagShorthandUniqueness(t *testing.T) { - // Test that flag shorthands don't conflict within commands - joinFlags := clusterJoinCmd.Flags() - statusFlags := clusterStatusCmd.Flags() - - // Check join command flags - leaderFlag := joinFlags.Lookup("leader") - nodeIDFlag := joinFlags.Lookup("node-id") - nodeAddrFlag := joinFlags.Lookup("node-addr") - - assert.Equal(t, "L", leaderFlag.Shorthand) - assert.Equal(t, "n", nodeIDFlag.Shorthand) - assert.Equal(t, "a", 
nodeAddrFlag.Shorthand) - - // Check status command flags - addrFlag := statusFlags.Lookup("addr") - assert.Equal(t, "a", addrFlag.Shorthand) - - // Note: Both join and status use -a, but they're in different commands so it's okay -} - -func TestClusterCommandAliases(t *testing.T) { - // Test that commands don't have conflicting aliases - assert.Empty(t, clusterCmd.Aliases, "Cluster command should not have aliases") - assert.Empty(t, clusterJoinCmd.Aliases, "Cluster join command should not have aliases") - assert.Empty(t, clusterStatusCmd.Aliases, "Cluster status command should not have aliases") -} - -func TestInvalidNetworkAddresses(t *testing.T) { - tests := []struct { - name string - address string - }{ - {"invalid host", "invalid-host:9001"}, - {"invalid port", "127.0.0.1:999999"}, - {"no port", "127.0.0.1"}, - {"empty", ""}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Test that invalid addresses are handled gracefully - _, err := net.DialTimeout("tcp", tt.address, 10*time.Millisecond) - assert.Error(t, err, "Should error for invalid address: %s", tt.address) - }) - } -} - -func TestStringValidation(t *testing.T) { - // Test string validation in cluster operations - tests := []struct { - name string - value string - valid bool - }{ - {"valid node ID", "node-1", true}, - {"valid address", "127.0.0.1:8001", true}, - {"empty node ID", "", false}, - {"node ID with spaces", "node 1", false}, - {"very long node ID", strings.Repeat("a", 1000), false}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if tt.valid { - assert.NotEmpty(t, strings.TrimSpace(tt.value), "Valid value should not be empty when trimmed") - assert.NotContains(t, tt.value, " ", "Valid node ID should not contain spaces") - } else { - if tt.value != "" && !strings.Contains(tt.value, " ") && len(tt.value) < 100 { - // Only test non-empty, non-space, reasonable length strings - assert.NotEmpty(t, tt.value) - } - } - }) - } -} diff --git 
a/cmd/pickbox/main.go b/cmd/pickbox/main.go deleted file mode 100644 index d3ac358..0000000 --- a/cmd/pickbox/main.go +++ /dev/null @@ -1,44 +0,0 @@ -package main - -import ( - "fmt" - "os" - - "github.com/spf13/cobra" -) - -var ( - version = "dev" - commit = "unknown" - date = "unknown" -) - -// rootCmd represents the base command when called without any subcommands -var rootCmd = &cobra.Command{ - Use: "pickbox", - Short: "A distributed file storage system similar to Dropbox", - Long: `Pickbox is a distributed file storage system with replication and consistency guarantees. -It supports file operations (OPEN, READ, WRITE, CLOSE) across multiple nodes using RAFT consensus. - -Features: -- Distributed storage with multiple nodes -- File replication and consistency -- RAFT consensus for distributed coordination -- Real-time file watching and replication -- Admin interface and monitoring -- Cluster management`, - Version: fmt.Sprintf("%s (commit: %s, built: %s)", version, commit, date), -} - -func main() { - if err := rootCmd.Execute(); err != nil { - fmt.Fprintln(os.Stderr, err) - os.Exit(1) - } -} - -func init() { - // Add global flags - rootCmd.PersistentFlags().StringP("log-level", "l", "info", "Set log level (debug, info, warn, error)") - rootCmd.PersistentFlags().StringP("data-dir", "d", "data", "Data directory for storage") -} diff --git a/cmd/pickbox/main_test.go b/cmd/pickbox/main_test.go deleted file mode 100644 index 96a0b6b..0000000 --- a/cmd/pickbox/main_test.go +++ /dev/null @@ -1,324 +0,0 @@ -package main - -import ( - "os" - "strings" - "testing" - - "github.com/spf13/cobra" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestRootCommand(t *testing.T) { - tests := []struct { - name string - expectedUse string - expectedShort string - }{ - { - name: "root command properties", - expectedUse: "pickbox", - expectedShort: "A distributed file storage system similar to Dropbox", - }, - } - - for _, tt := range tests 
{ - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expectedUse, rootCmd.Use) - assert.Equal(t, tt.expectedShort, rootCmd.Short) - assert.NotEmpty(t, rootCmd.Long) - assert.NotEmpty(t, rootCmd.Version) - }) - } -} - -func TestRootCommandVersion(t *testing.T) { - // Test version string format - assert.Contains(t, rootCmd.Version, version) - assert.Contains(t, rootCmd.Version, commit) - assert.Contains(t, rootCmd.Version, date) -} - -func TestRootCommandLong(t *testing.T) { - expectedFeatures := []string{ - "Distributed storage with multiple nodes", - "File replication and consistency", - "RAFT consensus", - "Real-time file watching", - "Admin interface", - "Cluster management", - } - - for _, feature := range expectedFeatures { - assert.Contains(t, rootCmd.Long, feature, "Long description should mention %s", feature) - } -} - -func TestGlobalFlags(t *testing.T) { - tests := []struct { - name string - flagName string - shortFlag string - defaultValue string - usage string - }{ - { - name: "log-level flag", - flagName: "log-level", - shortFlag: "l", - defaultValue: "info", - usage: "Set log level (debug, info, warn, error)", - }, - { - name: "data-dir flag", - flagName: "data-dir", - shortFlag: "d", - defaultValue: "data", - usage: "Data directory for storage", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - flag := rootCmd.PersistentFlags().Lookup(tt.flagName) - require.NotNil(t, flag, "Flag %s should exist", tt.flagName) - - assert.Equal(t, tt.defaultValue, flag.DefValue, "Default value mismatch for %s", tt.flagName) - assert.Equal(t, tt.usage, flag.Usage, "Usage description mismatch for %s", tt.flagName) - assert.Equal(t, tt.shortFlag, flag.Shorthand, "Short flag mismatch for %s", tt.flagName) - }) - } -} - -func TestRootCommandSubcommands(t *testing.T) { - // Test that expected subcommands are registered - expectedSubcommands := []string{"script", "cluster", "node", "multi-replication"} - - for _, expectedCmd := range 
expectedSubcommands { - found := false - for _, cmd := range rootCmd.Commands() { - if cmd.Use == expectedCmd { - found = true - break - } - } - if !found && expectedCmd != "node" && expectedCmd != "multi-replication" { - // node and multi-replication might be defined in other files - t.Logf("Warning: Expected subcommand '%s' not found", expectedCmd) - } - } -} - -func TestRootCommandExecution(t *testing.T) { - // Test help command execution - rootCmd.SetArgs([]string{"--help"}) - - // Capture output by temporarily redirecting - originalOut := os.Stdout - defer func() { os.Stdout = originalOut }() - - // Test that help command doesn't panic - assert.NotPanics(t, func() { - rootCmd.Execute() - }) -} - -func TestVersionFlag(t *testing.T) { - // Test version flag - rootCmd.SetArgs([]string{"--version"}) - - assert.NotPanics(t, func() { - rootCmd.Execute() - }) -} - -func TestMainFunction(t *testing.T) { - // Test that main function doesn't panic with valid commands - assert.NotPanics(t, func() { - // Save original args - originalArgs := os.Args - defer func() { os.Args = originalArgs }() - - // Set test args - os.Args = []string{"pickbox", "--help"} - - // This would normally call os.Exit, but in test we just verify no panic - // We can't easily test the actual main() function without modifying it - }) -} - -func TestGlobalFlagValidation(t *testing.T) { - tests := []struct { - name string - flagName string - value string - wantErr bool - }{ - { - name: "valid log level", - flagName: "log-level", - value: "debug", - wantErr: false, - }, - { - name: "valid data dir", - flagName: "data-dir", - value: "/tmp/test", - wantErr: false, - }, - { - name: "empty data dir", - flagName: "data-dir", - value: "", - wantErr: false, // Empty values are typically allowed for flags - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Create a new command instance to avoid side effects - testCmd := &cobra.Command{ - Use: "test", - } - 
testCmd.PersistentFlags().StringP("log-level", "l", "info", "Set log level") - testCmd.PersistentFlags().StringP("data-dir", "d", "data", "Data directory") - - testCmd.SetArgs([]string{"--" + tt.flagName, tt.value}) - - err := testCmd.Execute() - if tt.wantErr { - assert.Error(t, err) - } else { - assert.NoError(t, err) - } - }) - } -} - -func TestCommandStructure(t *testing.T) { - // Test that rootCmd has required fields - assert.NotEmpty(t, rootCmd.Use, "Use field should not be empty") - assert.NotEmpty(t, rootCmd.Short, "Short description should not be empty") - assert.NotEmpty(t, rootCmd.Long, "Long description should not be empty") - assert.NotEmpty(t, rootCmd.Version, "Version should not be empty") -} - -func TestVersionVariables(t *testing.T) { - // Test that version variables are properly set (even if defaults) - assert.NotEmpty(t, version, "version variable should be set") - assert.NotEmpty(t, commit, "commit variable should be set") - assert.NotEmpty(t, date, "date variable should be set") - - // Test default values - if version == "dev" { - assert.Equal(t, "dev", version, "default version should be 'dev'") - } - if commit == "unknown" { - assert.Equal(t, "unknown", commit, "default commit should be 'unknown'") - } - if date == "unknown" { - assert.Equal(t, "unknown", date, "default date should be 'unknown'") - } -} - -func TestFlagInheritance(t *testing.T) { - // Test that persistent flags exist on root command - logLevelFlag := rootCmd.PersistentFlags().Lookup("log-level") - dataDirFlag := rootCmd.PersistentFlags().Lookup("data-dir") - - assert.NotNil(t, logLevelFlag, "log-level flag should exist on root command") - assert.NotNil(t, dataDirFlag, "data-dir flag should exist on root command") - - // Test that some key subcommands exist - subcommandNames := []string{"script", "cluster"} - for _, name := range subcommandNames { - found := false - for _, cmd := range rootCmd.Commands() { - if cmd.Use == name { - found = true - break - } - } - assert.True(t, 
found, "Subcommand %s should exist", name) - } -} - -func TestLogLevelValidValues(t *testing.T) { - validLogLevels := []string{"debug", "info", "warn", "error"} - - for _, level := range validLogLevels { - t.Run("log_level_"+level, func(t *testing.T) { - // This tests that the flag accepts these values - testCmd := &cobra.Command{Use: "test"} - testCmd.Flags().StringP("log-level", "l", "info", "Set log level") - testCmd.SetArgs([]string{"--log-level", level}) - - err := testCmd.Execute() - assert.NoError(t, err, "Should accept log level: %s", level) - }) - } -} - -func TestCommandUsageText(t *testing.T) { - // Test that usage text is properly formatted - usage := rootCmd.UsageString() - assert.Contains(t, usage, "pickbox", "Usage should contain command name") - assert.Contains(t, usage, "Available Commands", "Usage should list available commands") - assert.Contains(t, usage, "Flags", "Usage should list available flags") -} - -func TestCommandAliases(t *testing.T) { - // Test that rootCmd doesn't have conflicting aliases - assert.Empty(t, rootCmd.Aliases, "Root command should not have aliases") - - // Check subcommands for proper alias setup - for _, cmd := range rootCmd.Commands() { - if len(cmd.Aliases) > 0 { - for _, alias := range cmd.Aliases { - assert.NotEmpty(t, alias, "Aliases should not be empty strings") - assert.NotEqual(t, cmd.Use, alias, "Alias should not match command name") - } - } - } -} - -func TestBashCompletion(t *testing.T) { - // Test that bash completion doesn't panic - assert.NotPanics(t, func() { - rootCmd.SetArgs([]string{"completion", "bash"}) - rootCmd.Execute() - }) -} - -func TestErrorHandling(t *testing.T) { - // Test with invalid flag - rootCmd.SetArgs([]string{"--invalid-flag"}) - - err := rootCmd.Execute() - assert.Error(t, err, "Should return error for invalid flag") - assert.Contains(t, err.Error(), "unknown flag", "Error should mention unknown flag") -} - -func TestHelpCommand(t *testing.T) { - // Test help command - 
rootCmd.SetArgs([]string{"help"}) - - assert.NotPanics(t, func() { - rootCmd.Execute() - }) -} - -func TestCommandValidation(t *testing.T) { - // Test command structure validation - // Note: Root command may not be runnable if it doesn't have a Run function - - // Test that required fields are not empty - assert.NotEmpty(t, strings.TrimSpace(rootCmd.Use)) - assert.NotEmpty(t, strings.TrimSpace(rootCmd.Short)) - assert.NotEmpty(t, strings.TrimSpace(rootCmd.Long)) - - // Test that the command has proper structure - assert.NotNil(t, rootCmd.Commands(), "Root command should have subcommands") - assert.True(t, len(rootCmd.Commands()) > 0, "Root command should have at least one subcommand") -} diff --git a/cmd/pickbox/multi_replication_test.go b/cmd/pickbox/multi_replication_test.go deleted file mode 100644 index 17abfac..0000000 --- a/cmd/pickbox/multi_replication_test.go +++ /dev/null @@ -1,578 +0,0 @@ -package main - -import ( - "crypto/sha256" - "encoding/hex" - "encoding/json" - "io" - "os" - "path/filepath" - "testing" - - "github.com/hashicorp/raft" - "github.com/sirupsen/logrus" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -// Test for MultiConfig validation -func TestMultiConfig_Validation(t *testing.T) { - tests := []struct { - name string - config MultiConfig - wantErr bool - }{ - { - name: "valid config", - config: MultiConfig{ - DataDir: "/tmp/test", - NodeID: "node1", - Port: 8000, - AdminPort: 9000, - MonitorPort: 8080, - DashboardPort: 8090, - LogLevel: "info", - }, - wantErr: false, - }, - { - name: "invalid config - empty data dir", - config: MultiConfig{ - DataDir: "", - NodeID: "node1", - Port: 8000, - AdminPort: 9000, - MonitorPort: 8080, - DashboardPort: 8090, - LogLevel: "info", - }, - wantErr: true, - }, - { - name: "invalid config - empty node ID", - config: MultiConfig{ - DataDir: "/tmp/test", - NodeID: "", - Port: 8000, - AdminPort: 9000, - MonitorPort: 8080, - DashboardPort: 8090, - LogLevel: "info", - }, 
- wantErr: true, - }, - { - name: "invalid config - zero port", - config: MultiConfig{ - DataDir: "/tmp/test", - NodeID: "node1", - Port: 0, - AdminPort: 9000, - MonitorPort: 8080, - DashboardPort: 8090, - LogLevel: "info", - }, - wantErr: true, - }, - { - name: "invalid config - zero admin port", - config: MultiConfig{ - DataDir: "/tmp/test", - NodeID: "node1", - Port: 8000, - AdminPort: 0, - MonitorPort: 8080, - DashboardPort: 8090, - LogLevel: "info", - }, - wantErr: true, - }, - { - name: "invalid config - zero monitor port", - config: MultiConfig{ - DataDir: "/tmp/test", - NodeID: "node1", - Port: 8000, - AdminPort: 9000, - MonitorPort: 0, - DashboardPort: 8090, - LogLevel: "info", - }, - wantErr: true, - }, - { - name: "invalid config - zero dashboard port", - config: MultiConfig{ - DataDir: "/tmp/test", - NodeID: "node1", - Port: 8000, - AdminPort: 9000, - MonitorPort: 8080, - DashboardPort: 0, - LogLevel: "info", - }, - wantErr: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - err := validateMultiConfig(tt.config) - if tt.wantErr { - assert.Error(t, err) - } else { - assert.NoError(t, err) - } - }) - } -} - -// Test for MultiApplication creation -func TestNewMultiApplication(t *testing.T) { - tests := []struct { - name string - config MultiConfig - wantErr bool - }{ - { - name: "invalid config should fail", - config: MultiConfig{ - DataDir: "", - NodeID: "node1", - Port: 8000, - AdminPort: 9000, - MonitorPort: 8080, - DashboardPort: 8090, - LogLevel: "info", - }, - wantErr: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - app, err := NewMultiApplication(tt.config) - if tt.wantErr { - assert.Error(t, err) - assert.Nil(t, app) - } else { - assert.NoError(t, err) - assert.NotNil(t, app) - } - }) - } -} - -// Test command structure for multi-directional replication -func TestCommand_JSONSerialization(t *testing.T) { - tests := []struct { - name string - command interface{} - wantErr bool 
- }{ - { - name: "write command", - command: struct { - Op string `json:"op"` - Path string `json:"path"` - Data []byte `json:"data"` - Hash string `json:"hash"` - NodeID string `json:"node_id"` - Sequence int64 `json:"sequence"` - }{ - Op: "write", - Path: "test.txt", - Data: []byte("test content"), - Hash: "hash123", - NodeID: "node1", - Sequence: 1, - }, - wantErr: false, - }, - { - name: "delete command", - command: struct { - Op string `json:"op"` - Path string `json:"path"` - Data []byte `json:"data"` - Hash string `json:"hash"` - NodeID string `json:"node_id"` - Sequence int64 `json:"sequence"` - }{ - Op: "delete", - Path: "test.txt", - Data: nil, - Hash: "", - NodeID: "node1", - Sequence: 2, - }, - wantErr: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Test JSON marshaling - data, err := json.Marshal(tt.command) - if tt.wantErr { - assert.Error(t, err) - } else { - assert.NoError(t, err) - assert.NotEmpty(t, data) - - // Test JSON unmarshaling - var restored interface{} - err = json.Unmarshal(data, &restored) - assert.NoError(t, err) - } - }) - } -} - -// Test content hashing function -func TestHashContent(t *testing.T) { - tests := []struct { - name string - data []byte - want string - }{ - { - name: "empty data", - data: []byte{}, - want: "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", - }, - { - name: "hello world", - data: []byte("hello world"), - want: "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9", - }, - { - name: "test content", - data: []byte("test content"), - want: computeExpectedHash([]byte("test content")), - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := hashContent(tt.data) - assert.Equal(t, tt.want, got) - }) - } -} - -// Test deriveMultiAdminAddress function -func TestDeriveMultiAdminAddress(t *testing.T) { - tests := []struct { - name string - raftAddr string - want string - }{ - { - name: "valid address", - raftAddr: 
"127.0.0.1:8001", - want: "127.0.0.1:9001", - }, - { - name: "different port", - raftAddr: "127.0.0.1:8002", - want: "127.0.0.1:9002", - }, - { - name: "different host", - raftAddr: "192.168.1.1:8001", - want: "192.168.1.1:9001", - }, - { - name: "invalid address", - raftAddr: "invalid", - want: "127.0.0.1:9001", - }, - { - name: "invalid port", - raftAddr: "127.0.0.1:invalid", - want: "127.0.0.1:9001", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := deriveMultiAdminAddress(tt.raftAddr) - assert.Equal(t, tt.want, got) - }) - } -} - -// Test runMultiReplication function -func TestRunMultiReplication(t *testing.T) { - if testing.Short() { - t.Skip("Skipping integration test in short mode") - } - - tempDir := t.TempDir() - logger := logrus.New() - logger.SetOutput(io.Discard) // Suppress logs during testing - - tests := []struct { - name string - nodeID string - port int - join string - dataDir string - wantErr bool - }{ - { - name: "valid parameters", - nodeID: "test-node", - port: 8101, // Use different port to avoid conflicts - join: "", - dataDir: tempDir, - wantErr: false, - }, - { - name: "empty node ID", - nodeID: "", - port: 8102, - join: "", - dataDir: tempDir, - wantErr: true, - }, - { - name: "invalid port", - nodeID: "test-node", - port: 0, - join: "", - dataDir: tempDir, - wantErr: true, - }, - { - name: "empty data dir", - nodeID: "test-node", - port: 8103, - join: "", - dataDir: "", - wantErr: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - err := runMultiReplication(tt.nodeID, tt.port, tt.join, tt.dataDir, logger) - if tt.wantErr { - assert.Error(t, err) - } else { - assert.NoError(t, err) - } - }) - } -} - -// Test createMultiWelcomeFile function -func TestCreateMultiWelcomeFile(t *testing.T) { - tempDir := t.TempDir() - logger := logrus.New() - logger.SetOutput(io.Discard) // Suppress logs during testing - - nodeID := "test-node" - createMultiWelcomeFile(tempDir, nodeID, 
logger) - - // Check if welcome file was created - welcomeFile := filepath.Join(tempDir, "welcome.txt") - assert.FileExists(t, welcomeFile) - - // Check file content - content, err := os.ReadFile(welcomeFile) - require.NoError(t, err) - assert.Contains(t, string(content), nodeID) - assert.Contains(t, string(content), "Multi-Directional Distributed Storage") -} - -// Test multiRaftWrapper -func TestMultiRaftWrapper(t *testing.T) { - // Create a mock raft manager for testing - // Note: This is a simplified test as creating a full raft manager is complex - t.Run("wrapper interface", func(t *testing.T) { - // Test that multiRaftWrapper implements the expected interface - var wrapper interface{} = &multiRaftWrapper{} - - // Check that it has the expected methods - assert.NotNil(t, wrapper) - }) -} - -// Test multiForwarderWrapper -func TestMultiForwarderWrapper(t *testing.T) { - logger := logrus.New() - logger.SetOutput(io.Discard) // Suppress logs during testing - - wrapper := &multiForwarderWrapper{logger: logger} - - // Test wrapper with nil logger (should not panic) - wrapperNoLogger := &multiForwarderWrapper{logger: nil} - assert.NotNil(t, wrapperNoLogger) - - // Test that wrapper implements the expected interface - assert.NotNil(t, wrapper) -} - -// Test raft wrapper implementations -func TestRaftWrapperImplementations(t *testing.T) { - t.Run("multiRaftWrapper methods", func(t *testing.T) { - // Test that multiRaftWrapper has the expected methods - wrapper := &multiRaftWrapper{} - assert.NotNil(t, wrapper) - - // Test nil safety - these should not panic when rm is nil - assert.Equal(t, raft.Shutdown, wrapper.State()) - assert.Equal(t, raft.ServerAddress(""), wrapper.Leader()) - }) - - t.Run("multiForwarderWrapper methods", func(t *testing.T) { - logger := logrus.New() - logger.SetOutput(io.Discard) - - wrapper := &multiForwarderWrapper{logger: logger} - assert.NotNil(t, wrapper) - - // Test with nil logger - wrapperNil := &multiForwarderWrapper{logger: nil} - 
assert.NotNil(t, wrapperNil) - }) -} - -// Benchmark tests -func BenchmarkHashContent(b *testing.B) { - data := []byte("test content for benchmarking") - - b.ResetTimer() - for i := 0; i < b.N; i++ { - hashContent(data) - } -} - -func BenchmarkDeriveMultiAdminAddress(b *testing.B) { - addr := "127.0.0.1:8001" - - b.ResetTimer() - for i := 0; i < b.N; i++ { - deriveMultiAdminAddress(addr) - } -} - -func BenchmarkMultiConfigValidation(b *testing.B) { - config := MultiConfig{ - DataDir: "/tmp/test", - NodeID: "node1", - Port: 8000, - AdminPort: 9000, - MonitorPort: 8080, - DashboardPort: 8090, - LogLevel: "info", - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - validateMultiConfig(config) - } -} - -// Helper functions for testing -func hashContent(data []byte) string { - hash := sha256.Sum256(data) - return hex.EncodeToString(hash[:]) -} - -func computeExpectedHash(data []byte) string { - hash := sha256.Sum256(data) - return hex.EncodeToString(hash[:]) -} - -// Integration tests -func TestMultiApplicationIntegration(t *testing.T) { - if testing.Short() { - t.Skip("Skipping integration test in short mode") - } - - tempDir := t.TempDir() - - config := MultiConfig{ - DataDir: tempDir, - NodeID: "test-node", - Port: 8201, // Use different port to avoid conflicts - AdminPort: 9201, - MonitorPort: 8280, - DashboardPort: 8290, - LogLevel: "error", // Use error level to reduce test output - BootstrapCluster: true, - } - - t.Run("application lifecycle", func(t *testing.T) { - app, err := NewMultiApplication(config) - require.NoError(t, err) - require.NotNil(t, app) - - // Test components initialization - assert.NotNil(t, app.config) - assert.NotNil(t, app.logger) - assert.NotNil(t, app.raftManager) - assert.NotNil(t, app.stateManager) - assert.NotNil(t, app.fileWatcher) - assert.NotNil(t, app.adminServer) - assert.NotNil(t, app.monitor) - assert.NotNil(t, app.dashboard) - - // Test that data directory was created - assert.DirExists(t, tempDir) - - // Test stop 
functionality - err = app.Stop() - assert.NoError(t, err) - }) -} - -// Test edge cases and error handling -func TestMultiApplicationErrorHandling(t *testing.T) { - if testing.Short() { - t.Skip("Skipping integration test in short mode") - } - - t.Run("invalid data directory", func(t *testing.T) { - config := MultiConfig{ - DataDir: "/invalid/path/that/does/not/exist/and/cannot/be/created", - NodeID: "test-node", - Port: 8301, - AdminPort: 9301, - MonitorPort: 8380, - DashboardPort: 8390, - LogLevel: "error", - BootstrapCluster: true, - } - - app, err := NewMultiApplication(config) - assert.Error(t, err) - assert.Nil(t, app) - }) - - t.Run("invalid log level", func(t *testing.T) { - tempDir := t.TempDir() - config := MultiConfig{ - DataDir: tempDir, - NodeID: "test-node", - Port: 8302, - AdminPort: 9302, - MonitorPort: 8381, - DashboardPort: 8391, - LogLevel: "invalid-level", - BootstrapCluster: true, - } - - app, err := NewMultiApplication(config) - require.NoError(t, err) - require.NotNil(t, app) - - // Should default to info level - assert.Equal(t, logrus.InfoLevel, app.logger.Level) - - app.Stop() - }) -} diff --git a/cmd/pickbox/node.go b/cmd/pickbox/node.go deleted file mode 100644 index f4a6e97..0000000 --- a/cmd/pickbox/node.go +++ /dev/null @@ -1,507 +0,0 @@ -package main - -import ( - "errors" - "fmt" - "os" - "os/signal" - "path/filepath" - "strconv" - "strings" - "syscall" - "time" - - "github.com/addityasingh/pickbox/pkg/admin" - "github.com/addityasingh/pickbox/pkg/monitoring" - "github.com/addityasingh/pickbox/pkg/storage" - "github.com/addityasingh/pickbox/pkg/watcher" - "github.com/hashicorp/raft" - "github.com/sirupsen/logrus" - "github.com/spf13/cobra" -) - -var nodeCmd = &cobra.Command{ - Use: "node", - Short: "Node management commands", - Long: `Commands for managing Pickbox nodes including starting, stopping, and configuration`, -} - -var nodeStartCmd = &cobra.Command{ - Use: "start", - Short: "Start a Pickbox node with full features", - Long: 
`Start a Pickbox node with all features including: -- Multi-directional file replication -- Admin interface -- Monitoring and dashboard -- Cluster management`, - RunE: runNodeStart, -} - -var nodeMultiCmd = &cobra.Command{ - Use: "multi", - Short: "Start a node with multi-directional replication", - Long: `Start a node with multi-directional replication capabilities. -This mode provides real-time file watching and multi-directional replication across all nodes.`, - RunE: runNodeMulti, -} - -// Node start command flags -var ( - nodeID string - port int - adminPort int - monitorPort int - dashboardPort int - joinAddr string - bootstrapCluster bool -) - -// Live node command flags -var ( - liveNodeID string - livePort int - liveJoin string -) - -func init() { - rootCmd.AddCommand(nodeCmd) - nodeCmd.AddCommand(nodeStartCmd) - nodeCmd.AddCommand(nodeMultiCmd) - - // Node start command flags - nodeStartCmd.Flags().StringVarP(&nodeID, "node-id", "n", "", "Node ID (required)") - nodeStartCmd.Flags().IntVarP(&port, "port", "p", 8001, "Raft port") - nodeStartCmd.Flags().IntVar(&adminPort, "admin-port", 9001, "Admin API port") - nodeStartCmd.Flags().IntVar(&monitorPort, "monitor-port", 9002, "Monitor port") - nodeStartCmd.Flags().IntVar(&dashboardPort, "dashboard-port", 9003, "Dashboard port") - nodeStartCmd.Flags().StringVarP(&joinAddr, "join", "j", "", "Address of node to join") - nodeStartCmd.Flags().BoolVarP(&bootstrapCluster, "bootstrap", "b", false, "Bootstrap new cluster") - nodeStartCmd.MarkFlagRequired("node-id") - - // Multi-directional replication command flags - nodeMultiCmd.Flags().StringVarP(&liveNodeID, "node-id", "n", "", "Node ID (required)") - nodeMultiCmd.Flags().IntVarP(&livePort, "port", "p", 8001, "Port") - nodeMultiCmd.Flags().StringVarP(&liveJoin, "join", "j", "", "Address of node to join") - nodeMultiCmd.MarkFlagRequired("node-id") -} - -func runNodeStart(cmd *cobra.Command, args []string) error { - // Get global flags - logLevel, _ := 
cmd.Flags().GetString("log-level") - dataDir, _ := cmd.Flags().GetString("data-dir") - - // Create configuration - config := AppConfig{ - NodeID: nodeID, - Port: port, - AdminPort: adminPort, - MonitorPort: monitorPort, - DashboardPort: dashboardPort, - JoinAddr: joinAddr, - DataDir: filepath.Join(dataDir, nodeID), - LogLevel: logLevel, - BootstrapCluster: bootstrapCluster, - } - - // Create and start application - app, err := NewApplication(config) - if err != nil { - return fmt.Errorf("creating application: %w", err) - } - - // Start application - if err := app.Start(); err != nil { - return fmt.Errorf("starting application: %w", err) - } - - // Setup graceful shutdown - setupSignalHandling(app) - - // Keep running - select {} -} - -func runNodeMulti(cmd *cobra.Command, args []string) error { - // Get global flags - logLevel, _ := cmd.Flags().GetString("log-level") - dataDir, _ := cmd.Flags().GetString("data-dir") - - // Set up logging - logger := logrus.New() - level, err := logrus.ParseLevel(logLevel) - if err != nil { - level = logrus.InfoLevel - } - logger.SetLevel(level) - logger.Infof("Starting multi-directional replication node %s on port %d", liveNodeID, livePort) - - // Setup data directory - nodeDataDir := filepath.Join(dataDir, liveNodeID) - if err := os.MkdirAll(nodeDataDir, 0750); err != nil { - return fmt.Errorf("creating data directory: %w", err) - } - - // Start multi-directional replication node - return runMultiReplication(liveNodeID, livePort, liveJoin, nodeDataDir, logger) -} - -// AppConfig holds all configuration for the application. -type AppConfig struct { - NodeID string - Port int - AdminPort int - MonitorPort int - DashboardPort int - JoinAddr string - DataDir string - LogLevel string - BootstrapCluster bool -} - -// validateConfig validates the application configuration. 
-func validateConfig(cfg AppConfig) error { - if cfg.DataDir == "" { - return errors.New("data directory cannot be empty") - } - if cfg.NodeID == "" { - return errors.New("node ID cannot be empty") - } - if cfg.Port <= 0 { - return errors.New("port must be positive") - } - if cfg.AdminPort <= 0 { - return errors.New("admin port must be positive") - } - if cfg.MonitorPort <= 0 { - return errors.New("monitor port must be positive") - } - if cfg.DashboardPort <= 0 { - return errors.New("dashboard port must be positive") - } - return nil -} - -// Application represents the main application with all components. -type Application struct { - config AppConfig - logger *logrus.Logger - raftManager *storage.RaftManager - stateManager *watcher.DefaultStateManager - fileWatcher *watcher.FileWatcher - adminServer *admin.Server - monitor *monitoring.Monitor - dashboard *monitoring.Dashboard -} - -// NewApplication creates a new application instance with all components. -func NewApplication(cfg AppConfig) (*Application, error) { - // Validate configuration - if err := validateConfig(cfg); err != nil { - return nil, fmt.Errorf("invalid configuration: %w", err) - } - - // Setup logger - logger := logrus.New() - level, err := logrus.ParseLevel(cfg.LogLevel) - if err != nil { - level = logrus.InfoLevel - } - logger.SetLevel(level) - logger.SetFormatter(&logrus.TextFormatter{ - FullTimestamp: true, - ForceColors: true, - }) - - // Create data directory - if err := os.MkdirAll(cfg.DataDir, 0750); err != nil { - return nil, fmt.Errorf("creating data directory: %w", err) - } - - app := &Application{ - config: cfg, - logger: logger, - } - - // Initialize components - if err := app.initializeComponents(); err != nil { - return nil, fmt.Errorf("initializing components: %w", err) - } - - return app, nil -} - -// initializeComponents sets up all application components. 
-func (app *Application) initializeComponents() error { - var err error - - // Initialize Raft manager - bindAddr := fmt.Sprintf("127.0.0.1:%d", app.config.Port) - app.raftManager, err = storage.NewRaftManager( - app.config.NodeID, - app.config.DataDir, - bindAddr, - ) - if err != nil { - return fmt.Errorf("creating raft manager: %w", err) - } - - // Initialize state manager - app.stateManager = watcher.NewDefaultStateManager() - - // Access the raft instance through the manager for admin server - raftInstance := app.getRaftInstance() - - // Initialize admin server - app.adminServer = admin.NewServer( - raftInstance, - app.config.AdminPort, - app.logger, - ) - - // Initialize monitoring - app.monitor = monitoring.NewMonitor( - app.config.NodeID, - raftInstance, - app.logger, - ) - - // Initialize dashboard - app.dashboard = monitoring.NewDashboard(app.monitor, app.logger) - - // Initialize file watcher with simplified approach - watcherConfig := watcher.Config{ - DataDir: app.config.DataDir, - NodeID: app.config.NodeID, - Logger: app.logger, - ApplyTimeout: 5 * time.Second, - } - - app.fileWatcher, err = watcher.NewFileWatcher( - watcherConfig, - &raftWrapper{app.raftManager}, - app.stateManager, - &forwarderWrapper{}, - ) - if err != nil { - return fmt.Errorf("creating file watcher: %w", err) - } - - return nil -} - -// getRaftInstance provides access to the underlying raft instance -func (app *Application) getRaftInstance() *raft.Raft { - if app.raftManager == nil { - return nil - } - return app.raftManager.GetRaft() -} - -// raftWrapper adapts RaftManager to the watcher.RaftApplier interface. 
-type raftWrapper struct { - rm *storage.RaftManager -} - -func (rw *raftWrapper) Apply(data []byte, timeout time.Duration) raft.ApplyFuture { - if rw.rm == nil { - return nil - } - return rw.rm.GetRaft().Apply(data, timeout) -} - -func (rw *raftWrapper) State() raft.RaftState { - if rw.rm == nil { - return raft.Shutdown - } - return rw.rm.State() -} - -func (rw *raftWrapper) Leader() raft.ServerAddress { - if rw.rm == nil { - return "" - } - return rw.rm.Leader() -} - -// forwarderWrapper implements the watcher.LeaderForwarder interface. -type forwarderWrapper struct{} - -func (fw *forwarderWrapper) ForwardToLeader(leaderAddr string, cmd watcher.Command) error { - adminCmd := admin.Command{ - Op: cmd.Op, - Path: cmd.Path, - Data: cmd.Data, - Hash: cmd.Hash, - NodeID: cmd.NodeID, - Sequence: cmd.Sequence, - } - return admin.ForwardToLeader(leaderAddr, adminCmd) -} - -// Start starts all application components. -func (app *Application) Start() error { - app.logger.Infof("🚀 Starting Pickbox node %s", app.config.NodeID) - - // Start Raft cluster - if err := app.startRaftCluster(); err != nil { - return fmt.Errorf("starting raft cluster: %w", err) - } - - // Start admin server - if err := app.adminServer.Start(); err != nil { - return fmt.Errorf("starting admin server: %w", err) - } - - // Start monitoring - app.monitor.StartHTTPServer(app.config.MonitorPort) - app.monitor.LogMetrics(30 * time.Second) - - // Start dashboard - app.dashboard.StartDashboardServer(app.config.DashboardPort) - - // Start file watcher - if err := app.fileWatcher.Start(); err != nil { - return fmt.Errorf("starting file watcher: %w", err) - } - - // Wait for leadership and join cluster if needed - go app.handleClusterMembership() - - app.logger.Infof("✅ Node %s started successfully", app.config.NodeID) - app.logAccessURLs() - - return nil -} - -// startRaftCluster initializes the Raft cluster. 
-func (app *Application) startRaftCluster() error { - if app.config.BootstrapCluster { - app.logger.Info("🏗️ Bootstrapping new cluster...") - - // Create server configuration for bootstrap - server := raft.Server{ - ID: raft.ServerID(app.config.NodeID), - Address: raft.ServerAddress(fmt.Sprintf("127.0.0.1:%d", app.config.Port)), - } - - if err := app.raftManager.BootstrapCluster([]raft.Server{server}); err != nil { - return fmt.Errorf("bootstrapping cluster: %w", err) - } - - app.logger.Infof("🏗️ Cluster bootstrapped with node %s", app.config.NodeID) - } - - return nil -} - -// handleClusterMembership handles joining cluster if join address is provided. -func (app *Application) handleClusterMembership() { - if app.config.JoinAddr == "" { - return - } - - app.logger.Info("⏳ Waiting for cluster membership...") - - // Wait briefly for the node to be ready - time.Sleep(2 * time.Second) - - // Derive admin address from Raft address - leaderAdminAddr := app.deriveAdminAddress(app.config.JoinAddr) - nodeAddr := fmt.Sprintf("127.0.0.1:%d", app.config.Port) - - // Try to join the cluster - if err := app.requestJoinCluster(leaderAdminAddr, app.config.NodeID, nodeAddr); err != nil { - app.logger.Errorf("❌ Failed to join cluster: %v", err) - return - } - - app.logger.Infof("🤝 Successfully joined cluster via %s", leaderAdminAddr) - - // Monitor leadership changes - go app.monitorLeadership() -} - -// deriveAdminAddress converts a Raft address to an admin address. -func (app *Application) deriveAdminAddress(raftAddr string) string { - parts := strings.Split(raftAddr, ":") - if len(parts) != 2 { - return "127.0.0.1:9001" // Fallback to default admin port - } - - raftPort, err := strconv.Atoi(parts[1]) - if err != nil { - return "127.0.0.1:9001" // Fallback to default admin port - } - - // Assume admin port is raftPort + 1000 - adminPort := raftPort + 1000 - return fmt.Sprintf("%s:%d", parts[0], adminPort) -} - -// requestJoinCluster requests to join the cluster via admin API. 
-func (app *Application) requestJoinCluster(leaderAdminAddr, nodeID, nodeAddr string) error { - return admin.RequestJoinCluster(leaderAdminAddr, nodeID, nodeAddr) -} - -// monitorLeadership monitors leadership changes and logs them. -func (app *Application) monitorLeadership() { - ticker := time.NewTicker(10 * time.Second) - defer ticker.Stop() - - var lastLeader raft.ServerAddress - - for range ticker.C { - currentLeader := app.raftManager.Leader() - if currentLeader != lastLeader { - if currentLeader == "" { - app.logger.Warn("👑 No leader elected") - } else { - app.logger.Infof("👑 Leader: %s", currentLeader) - } - lastLeader = currentLeader - } - } -} - -// logAccessURLs logs the access URLs for the various services. -func (app *Application) logAccessURLs() { - app.logger.Info("📊 Access URLs:") - app.logger.Infof(" Admin API: http://localhost:%d", app.config.AdminPort) - app.logger.Infof(" Monitoring: http://localhost:%d/metrics", app.config.MonitorPort) - app.logger.Infof(" Dashboard: http://localhost:%d", app.config.DashboardPort) - app.logger.Infof(" Data Directory: %s", app.config.DataDir) -} - -// Stop stops all application components. -func (app *Application) Stop() error { - app.logger.Info("🛑 Stopping Pickbox node...") - - // Stop file watcher - if app.fileWatcher != nil { - app.fileWatcher.Stop() - } - - // Note: Admin server doesn't have a Stop method, it will be cleaned up - // when the application exits - - // Stop Raft - if app.raftManager != nil { - app.raftManager.Shutdown() - } - - app.logger.Info("✅ Node stopped successfully") - return nil -} - -// setupSignalHandling sets up graceful shutdown on SIGINT and SIGTERM. 
-func setupSignalHandling(app *Application) { - c := make(chan os.Signal, 1) - signal.Notify(c, os.Interrupt, syscall.SIGTERM) - - go func() { - <-c - app.logger.Info("🛑 Received shutdown signal...") - app.Stop() - os.Exit(0) - }() -} diff --git a/cmd/pickbox/node_test.go b/cmd/pickbox/node_test.go deleted file mode 100644 index 46b4fd8..0000000 --- a/cmd/pickbox/node_test.go +++ /dev/null @@ -1,497 +0,0 @@ -package main - -import ( - "testing" - - "github.com/hashicorp/raft" - "github.com/sirupsen/logrus" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -// Test for AppConfig validation -func TestAppConfig_Validation(t *testing.T) { - tests := []struct { - name string - config AppConfig - wantErr bool - }{ - { - name: "valid config", - config: AppConfig{ - DataDir: "/tmp/test", - NodeID: "node1", - Port: 8000, - AdminPort: 9000, - MonitorPort: 8080, - DashboardPort: 8090, - LogLevel: "info", - }, - wantErr: false, - }, - { - name: "invalid config - empty data dir", - config: AppConfig{ - DataDir: "", - NodeID: "node1", - Port: 8000, - AdminPort: 9000, - MonitorPort: 8080, - DashboardPort: 8090, - LogLevel: "info", - }, - wantErr: true, - }, - { - name: "invalid config - empty node ID", - config: AppConfig{ - DataDir: "/tmp/test", - NodeID: "", - Port: 8000, - AdminPort: 9000, - MonitorPort: 8080, - DashboardPort: 8090, - LogLevel: "info", - }, - wantErr: true, - }, - { - name: "invalid config - zero port", - config: AppConfig{ - DataDir: "/tmp/test", - NodeID: "node1", - Port: 0, - AdminPort: 9000, - MonitorPort: 8080, - DashboardPort: 8090, - LogLevel: "info", - }, - wantErr: true, - }, - { - name: "invalid config - zero admin port", - config: AppConfig{ - DataDir: "/tmp/test", - NodeID: "node1", - Port: 8000, - AdminPort: 0, - MonitorPort: 8080, - DashboardPort: 8090, - LogLevel: "info", - }, - wantErr: true, - }, - { - name: "invalid config - zero monitor port", - config: AppConfig{ - DataDir: "/tmp/test", - NodeID: "node1", - 
Port: 8000, - AdminPort: 9000, - MonitorPort: 0, - DashboardPort: 8090, - LogLevel: "info", - }, - wantErr: true, - }, - { - name: "invalid config - zero dashboard port", - config: AppConfig{ - DataDir: "/tmp/test", - NodeID: "node1", - Port: 8000, - AdminPort: 9000, - MonitorPort: 8080, - DashboardPort: 0, - LogLevel: "info", - }, - wantErr: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - err := validateConfig(tt.config) - if tt.wantErr { - assert.Error(t, err) - } else { - assert.NoError(t, err) - } - }) - } -} - -// Test for Application creation -func TestNewApplication(t *testing.T) { - tests := []struct { - name string - config AppConfig - wantErr bool - }{ - { - name: "invalid config should fail", - config: AppConfig{ - DataDir: "", - NodeID: "node1", - Port: 8000, - AdminPort: 9000, - MonitorPort: 8080, - DashboardPort: 8090, - LogLevel: "info", - }, - wantErr: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - app, err := NewApplication(tt.config) - if tt.wantErr { - assert.Error(t, err) - assert.Nil(t, app) - } else { - assert.NoError(t, err) - assert.NotNil(t, app) - } - }) - } -} - -// Test deriveAdminAddress function -func TestDeriveAdminAddress(t *testing.T) { - if testing.Short() { - t.Skip("Skipping integration test in short mode") - } - - // Create a temporary application for testing - tempDir := t.TempDir() - config := AppConfig{ - DataDir: tempDir, - NodeID: "test-node", - Port: 8401, - AdminPort: 9401, - MonitorPort: 8480, - DashboardPort: 8490, - LogLevel: "error", - } - - app, err := NewApplication(config) - require.NoError(t, err) - require.NotNil(t, app) - defer app.Stop() - - tests := []struct { - name string - raftAddr string - want string - }{ - { - name: "valid address", - raftAddr: "127.0.0.1:8001", - want: "127.0.0.1:9001", - }, - { - name: "different port", - raftAddr: "127.0.0.1:8002", - want: "127.0.0.1:9002", - }, - { - name: "different host", - raftAddr: 
"192.168.1.1:8001", - want: "192.168.1.1:9001", - }, - { - name: "invalid address", - raftAddr: "invalid", - want: "127.0.0.1:9001", - }, - { - name: "invalid port", - raftAddr: "127.0.0.1:invalid", - want: "127.0.0.1:9001", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := app.deriveAdminAddress(tt.raftAddr) - assert.Equal(t, tt.want, got) - }) - } -} - -// Test raftWrapper -func TestRaftWrapper(t *testing.T) { - // Create a mock raft manager for testing - // Note: This is a simplified test as creating a full raft manager is complex - t.Run("wrapper interface", func(t *testing.T) { - // Test that raftWrapper implements the expected interface - var wrapper interface{} = &raftWrapper{} - - // Check that it has the expected methods - assert.NotNil(t, wrapper) - }) -} - -// Test forwarderWrapper -func TestForwarderWrapper(t *testing.T) { - wrapper := &forwarderWrapper{} - - // Test that wrapper implements the expected interface - assert.NotNil(t, wrapper) -} - -// Test raft wrapper implementations -func TestNodeRaftWrapperImplementations(t *testing.T) { - t.Run("raftWrapper methods", func(t *testing.T) { - // Test that raftWrapper has the expected methods - wrapper := &raftWrapper{} - assert.NotNil(t, wrapper) - - // Test nil safety - these should not panic when rm is nil - assert.Equal(t, raft.Shutdown, wrapper.State()) - assert.Equal(t, raft.ServerAddress(""), wrapper.Leader()) - }) - - t.Run("forwarderWrapper methods", func(t *testing.T) { - wrapper := &forwarderWrapper{} - assert.NotNil(t, wrapper) - }) -} - -// Benchmark tests -func BenchmarkAppConfigValidation(b *testing.B) { - config := AppConfig{ - DataDir: "/tmp/test", - NodeID: "node1", - Port: 8000, - AdminPort: 9000, - MonitorPort: 8080, - DashboardPort: 8090, - LogLevel: "info", - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - validateConfig(config) - } -} - -func BenchmarkDeriveAdminAddress(b *testing.B) { - tempDir := b.TempDir() - config := AppConfig{ - 
DataDir: tempDir, - NodeID: "test-node", - Port: 8501, - AdminPort: 9501, - MonitorPort: 8580, - DashboardPort: 8590, - LogLevel: "error", - } - - app, err := NewApplication(config) - require.NoError(b, err) - require.NotNil(b, app) - defer app.Stop() - - addr := "127.0.0.1:8001" - - b.ResetTimer() - for i := 0; i < b.N; i++ { - app.deriveAdminAddress(addr) - } -} - -// Integration tests -func TestApplicationIntegration(t *testing.T) { - if testing.Short() { - t.Skip("Skipping integration test in short mode") - } - - tempDir := t.TempDir() - - config := AppConfig{ - DataDir: tempDir, - NodeID: "test-node", - Port: 8601, - AdminPort: 9601, - MonitorPort: 8680, - DashboardPort: 8690, - LogLevel: "error", // Use error level to reduce test output - BootstrapCluster: true, - } - - t.Run("application lifecycle", func(t *testing.T) { - app, err := NewApplication(config) - require.NoError(t, err) - require.NotNil(t, app) - - // Test components initialization - assert.NotNil(t, app.config) - assert.NotNil(t, app.logger) - assert.NotNil(t, app.raftManager) - assert.NotNil(t, app.stateManager) - assert.NotNil(t, app.fileWatcher) - assert.NotNil(t, app.adminServer) - assert.NotNil(t, app.monitor) - assert.NotNil(t, app.dashboard) - - // Test that data directory was created - assert.DirExists(t, tempDir) - - // Test stop functionality - err = app.Stop() - assert.NoError(t, err) - }) -} - -// Test edge cases and error handling -func TestApplicationErrorHandling(t *testing.T) { - if testing.Short() { - t.Skip("Skipping integration test in short mode") - } - - t.Run("invalid data directory", func(t *testing.T) { - config := AppConfig{ - DataDir: "/invalid/path/that/does/not/exist/and/cannot/be/created", - NodeID: "test-node", - Port: 8701, - AdminPort: 9701, - MonitorPort: 8780, - DashboardPort: 8790, - LogLevel: "error", - BootstrapCluster: true, - } - - app, err := NewApplication(config) - assert.Error(t, err) - assert.Nil(t, app) - }) - - t.Run("invalid log level", func(t 
*testing.T) { - tempDir := t.TempDir() - config := AppConfig{ - DataDir: tempDir, - NodeID: "test-node", - Port: 8702, - AdminPort: 9702, - MonitorPort: 8781, - DashboardPort: 8791, - LogLevel: "invalid-level", - BootstrapCluster: true, - } - - app, err := NewApplication(config) - require.NoError(t, err) - require.NotNil(t, app) - - // Should default to info level - assert.Equal(t, logrus.InfoLevel, app.logger.Level) - - app.Stop() - }) -} - -// Test setupSignalHandling function -func TestSetupSignalHandling(t *testing.T) { - if testing.Short() { - t.Skip("Skipping integration test in short mode") - } - - tempDir := t.TempDir() - config := AppConfig{ - DataDir: tempDir, - NodeID: "test-node", - Port: 8801, - AdminPort: 9801, - MonitorPort: 8880, - DashboardPort: 8890, - LogLevel: "error", - BootstrapCluster: true, - } - - app, err := NewApplication(config) - require.NoError(t, err) - require.NotNil(t, app) - - // Test that setupSignalHandling doesn't panic - assert.NotPanics(t, func() { - setupSignalHandling(app) - }) - - app.Stop() -} - -// Test getRaftInstance function -func TestGetRaftInstance(t *testing.T) { - if testing.Short() { - t.Skip("Skipping integration test in short mode") - } - - tempDir := t.TempDir() - config := AppConfig{ - DataDir: tempDir, - NodeID: "test-node", - Port: 8901, - AdminPort: 9901, - MonitorPort: 8980, - DashboardPort: 8990, - LogLevel: "error", - BootstrapCluster: true, - } - - app, err := NewApplication(config) - require.NoError(t, err) - require.NotNil(t, app) - defer app.Stop() - - // Test getRaftInstance - raftInstance := app.getRaftInstance() - assert.NotNil(t, raftInstance) - - // Test with nil raftManager - app.raftManager = nil - raftInstance = app.getRaftInstance() - assert.Nil(t, raftInstance) -} - -// Test Application methods -func TestApplicationMethods(t *testing.T) { - if testing.Short() { - t.Skip("Skipping integration test in short mode") - } - - tempDir := t.TempDir() - config := AppConfig{ - DataDir: tempDir, - 
NodeID: "test-node", - Port: 9201, // Changed from 9101 to avoid conflict with multi-replication test - AdminPort: 10201, - MonitorPort: 9280, - DashboardPort: 9290, - LogLevel: "error", - BootstrapCluster: true, - } - - app, err := NewApplication(config) - require.NoError(t, err) - require.NotNil(t, app) - defer app.Stop() - - t.Run("logAccessURLs", func(t *testing.T) { - // Test that logAccessURLs doesn't panic - assert.NotPanics(t, func() { - app.logAccessURLs() - }) - }) - - t.Run("initializeComponents", func(t *testing.T) { - // Test that initializeComponents has already been called - assert.NotNil(t, app.raftManager) - assert.NotNil(t, app.stateManager) - assert.NotNil(t, app.fileWatcher) - assert.NotNil(t, app.adminServer) - assert.NotNil(t, app.monitor) - assert.NotNil(t, app.dashboard) - }) -} diff --git a/cmd/pickbox/script.go b/cmd/pickbox/script.go deleted file mode 100644 index b53fc49..0000000 --- a/cmd/pickbox/script.go +++ /dev/null @@ -1,145 +0,0 @@ -package main - -import ( - "fmt" - "os" - "os/exec" - "strconv" - "time" - - "github.com/spf13/cobra" -) - -var scriptCmd = &cobra.Command{ - Use: "script", - Short: "Run common cluster scripts", - Long: `Run common cluster scripts for testing and demonstration`, -} - -var scriptDemo3Cmd = &cobra.Command{ - Use: "demo-3-nodes", - Short: "Demo script for 3-node cluster", - Long: `Demonstrates setting up a 3-node cluster with bootstrap and joining`, - RunE: runDemo3Nodes, -} - -var scriptCleanupCmd = &cobra.Command{ - Use: "cleanup", - Short: "Clean up data directories", - Long: `Clean up data directories from previous runs`, - RunE: runCleanup, -} - -func init() { - rootCmd.AddCommand(scriptCmd) - scriptCmd.AddCommand(scriptDemo3Cmd) - scriptCmd.AddCommand(scriptCleanupCmd) -} - -func runDemo3Nodes(cmd *cobra.Command, args []string) error { - fmt.Println("🚀 Starting 3-node cluster demo...") - - // Get data directory from global flags - dataDir, _ := cmd.Flags().GetString("data-dir") - - // Clean up 
first - if err := cleanup(dataDir); err != nil { - fmt.Printf("Warning: cleanup failed: %v\n", err) - } - - fmt.Println("📋 Starting nodes...") - - // Start node1 as bootstrap - fmt.Println("Starting node1 (bootstrap)...") - if err := startNodeInBackground("node1", 8001, 9001, "", true); err != nil { - return fmt.Errorf("starting node1: %w", err) - } - - // Wait for node1 to be ready - time.Sleep(3 * time.Second) - - // Start node2 - fmt.Println("Starting node2...") - if err := startNodeInBackground("node2", 8002, 9002, "127.0.0.1:8001", false); err != nil { - return fmt.Errorf("starting node2: %w", err) - } - - // Start node3 - fmt.Println("Starting node3...") - if err := startNodeInBackground("node3", 8003, 9003, "127.0.0.1:8001", false); err != nil { - return fmt.Errorf("starting node3: %w", err) - } - - fmt.Println("✅ 3-node cluster started!") - fmt.Println("📊 Access URLs:") - fmt.Println(" Node1 Admin: http://localhost:9001") - fmt.Println(" Node2 Admin: http://localhost:9002") - fmt.Println(" Node3 Admin: http://localhost:9003") - fmt.Println(" Node1 Dashboard: http://localhost:9003") - fmt.Println(" Node2 Dashboard: http://localhost:9006") - fmt.Println(" Node3 Dashboard: http://localhost:9009") - fmt.Println("📁 Data directories:") - fmt.Println(" Node1: data/node1") - fmt.Println(" Node2: data/node2") - fmt.Println(" Node3: data/node3") - fmt.Println("🛑 To stop all nodes, run: pkill pickbox") - - return nil -} - -func runCleanup(cmd *cobra.Command, args []string) error { - // Get data directory from global flags - dataDir, _ := cmd.Flags().GetString("data-dir") - - fmt.Println("🧹 Cleaning up data directories...") - return cleanup(dataDir) -} - -func cleanup(dataDir string) error { - if err := os.RemoveAll(dataDir); err != nil { - return fmt.Errorf("removing data directory: %w", err) - } - - fmt.Println("✅ Cleanup completed") - return nil -} - -func startNodeInBackground(nodeID string, port, adminPort int, joinAddr string, bootstrap bool) error { - // Build 
command arguments - args := []string{ - "node", "start", - "--node-id", nodeID, - "--port", strconv.Itoa(port), - "--admin-port", strconv.Itoa(adminPort), - "--monitor-port", strconv.Itoa(adminPort + 1), - "--dashboard-port", strconv.Itoa(adminPort + 2), - } - - if bootstrap { - args = append(args, "--bootstrap") - } - - if joinAddr != "" { - args = append(args, "--join", joinAddr) - } - - // Use hardcoded binary path for security - executable := "./bin/pickbox" - - // Validate that the binary exists - if _, err := os.Stat(executable); os.IsNotExist(err) { - return fmt.Errorf("pickbox binary not found at %s - please run 'make build' first", executable) - } - - // Start the command in background - cmd := exec.Command(executable, args...) - cmd.Dir = "." // Set working directory to project root - - // Start the process - if err := cmd.Start(); err != nil { - return fmt.Errorf("starting node %s: %w", nodeID, err) - } - - fmt.Printf("✅ Node %s started (PID: %d)\n", nodeID, cmd.Process.Pid) - return nil -} diff --git a/cmd/pickbox/script_test.go b/cmd/pickbox/script_test.go deleted file mode 100644 index f1bceb0..0000000 --- a/cmd/pickbox/script_test.go +++ /dev/null @@ -1,463 +0,0 @@ -package main - -import ( - "os" - "path/filepath" - "testing" - - "github.com/spf13/cobra" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestScriptCommand(t *testing.T) { - tests := []struct { - name string - expectedUse string - expectedShort string - }{ - { - name: "script command properties", - expectedUse: "script", - expectedShort: "Run common cluster scripts", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expectedUse, scriptCmd.Use) - assert.Equal(t, tt.expectedShort, scriptCmd.Short) - assert.NotEmpty(t, scriptCmd.Long) - }) - } -} - -func TestScriptDemo3Command(t *testing.T) { - tests := []struct { - name string - expectedUse string - expectedShort string - }{ - { - name: "demo-3-nodes 
command properties", - expectedUse: "demo-3-nodes", - expectedShort: "Demo script for 3-node cluster", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expectedUse, scriptDemo3Cmd.Use) - assert.Equal(t, tt.expectedShort, scriptDemo3Cmd.Short) - assert.NotEmpty(t, scriptDemo3Cmd.Long) - assert.NotNil(t, scriptDemo3Cmd.RunE) - }) - } -} - -func TestScriptCleanupCommand(t *testing.T) { - tests := []struct { - name string - expectedUse string - expectedShort string - }{ - { - name: "cleanup command properties", - expectedUse: "cleanup", - expectedShort: "Clean up data directories", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expectedUse, scriptCleanupCmd.Use) - assert.Equal(t, tt.expectedShort, scriptCleanupCmd.Short) - assert.NotEmpty(t, scriptCleanupCmd.Long) - assert.NotNil(t, scriptCleanupCmd.RunE) - }) - } -} - -func TestScriptCommandInitialization(t *testing.T) { - // Test that script command is properly added to root - found := false - for _, cmd := range rootCmd.Commands() { - if cmd.Use == "script" { - found = true - break - } - } - assert.True(t, found, "script command should be added to root command") - - // Test that subcommands are properly added to script command - subcommands := scriptCmd.Commands() - expectedSubcommands := []string{"demo-3-nodes", "cleanup"} - - for _, expected := range expectedSubcommands { - found := false - for _, cmd := range subcommands { - if cmd.Use == expected { - found = true - break - } - } - assert.True(t, found, "subcommand %s should be added to script command", expected) - } -} - -func TestCleanupFunction(t *testing.T) { - // Create a temporary directory for testing - tempDir, err := os.MkdirTemp("", "pickbox_test_") - require.NoError(t, err) - - // Create some test files/directories - testFile := filepath.Join(tempDir, "test.txt") - testSubDir := filepath.Join(tempDir, "subdir") - - err = os.WriteFile(testFile, 
[]byte("test data"), 0644) - require.NoError(t, err) - - err = os.MkdirAll(testSubDir, 0755) - require.NoError(t, err) - - // Verify files exist before cleanup - assert.FileExists(t, testFile) - assert.DirExists(t, testSubDir) - - // Test cleanup function - err = cleanup(tempDir) - assert.NoError(t, err) - - // Verify directory was removed - assert.NoFileExists(t, tempDir) -} - -func TestCleanupFunctionWithNonExistentDirectory(t *testing.T) { - // Test cleanup with non-existent directory - nonExistentDir := "/tmp/non_existent_pickbox_test_dir_12345" - - err := cleanup(nonExistentDir) - assert.NoError(t, err, "cleanup should not error on non-existent directory") -} - -func TestRunCleanupCommand(t *testing.T) { - // Create a temporary directory for testing - tempDir, err := os.MkdirTemp("", "pickbox_cleanup_test_") - require.NoError(t, err) - - // Create test content - testFile := filepath.Join(tempDir, "test.txt") - err = os.WriteFile(testFile, []byte("test"), 0644) - require.NoError(t, err) - - // Create a test command with the temp directory - testCmd := &cobra.Command{Use: "test"} - testCmd.Flags().String("data-dir", tempDir, "test data dir") - - // Test the runCleanup function - err = runCleanup(testCmd, []string{}) - assert.NoError(t, err) - - // Verify directory was cleaned - assert.NoFileExists(t, tempDir) -} - -func TestRunDemo3NodesWithoutBinary(t *testing.T) { - // Create a temporary directory for testing - tempDir, err := os.MkdirTemp("", "pickbox_demo_test_") - require.NoError(t, err) - defer os.RemoveAll(tempDir) - - // Create a test command - testCmd := &cobra.Command{Use: "test"} - testCmd.Flags().String("data-dir", tempDir, "test data dir") - - // Test the runDemo3Nodes function - this should fail because binary doesn't exist - err = runDemo3Nodes(testCmd, []string{}) - assert.Error(t, err, "should error when pickbox binary is not found") - assert.Contains(t, err.Error(), "pickbox binary not found") -} - -func TestStartNodeInBackgroundValidation(t 
*testing.T) { - tests := []struct { - name string - nodeID string - port int - adminPort int - joinAddr string - bootstrap bool - expectErr bool - errContains string - }{ - { - name: "valid bootstrap node", - nodeID: "node1", - port: 8001, - adminPort: 9001, - joinAddr: "", - bootstrap: true, - expectErr: true, // Will fail because binary doesn't exist - errContains: "pickbox binary not found", - }, - { - name: "valid joining node", - nodeID: "node2", - port: 8002, - adminPort: 9002, - joinAddr: "127.0.0.1:8001", - bootstrap: false, - expectErr: true, // Will fail because binary doesn't exist - errContains: "pickbox binary not found", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - err := startNodeInBackground(tt.nodeID, tt.port, tt.adminPort, tt.joinAddr, tt.bootstrap) - - if tt.expectErr { - assert.Error(t, err) - if tt.errContains != "" { - assert.Contains(t, err.Error(), tt.errContains) - } - } else { - assert.NoError(t, err) - } - }) - } -} - -func TestStartNodeInBackgroundCommandArgs(t *testing.T) { - // Since we can't actually start nodes in tests, we'll test the argument building logic - // by verifying the function handles different parameter combinations correctly - - tests := []struct { - name string - nodeID string - port int - adminPort int - joinAddr string - bootstrap bool - }{ - { - name: "bootstrap node", - nodeID: "node1", - port: 8001, - adminPort: 9001, - joinAddr: "", - bootstrap: true, - }, - { - name: "joining node", - nodeID: "node2", - port: 8002, - adminPort: 9002, - joinAddr: "127.0.0.1:8001", - bootstrap: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // We can't test the actual execution, but we can verify error handling - err := startNodeInBackground(tt.nodeID, tt.port, tt.adminPort, tt.joinAddr, tt.bootstrap) - - // Should error because binary doesn't exist - assert.Error(t, err) - assert.Contains(t, err.Error(), "pickbox binary not found") - }) - } -} - -func 
TestScriptCommandUsage(t *testing.T) { - // Test script command usage - usage := scriptCmd.UsageString() - assert.Contains(t, usage, "script") - assert.Contains(t, usage, "Available Commands") -} - -func TestDemo3NodesCommandUsage(t *testing.T) { - // Test demo-3-nodes command usage - usage := scriptDemo3Cmd.UsageString() - assert.Contains(t, usage, "demo-3-nodes") -} - -func TestCleanupCommandUsage(t *testing.T) { - // Test cleanup command usage - usage := scriptCleanupCmd.UsageString() - assert.Contains(t, usage, "cleanup") -} - -func TestScriptCommandHelp(t *testing.T) { - // Test that help doesn't panic - assert.NotPanics(t, func() { - scriptCmd.SetArgs([]string{"--help"}) - scriptCmd.Execute() - }) -} - -func TestDemo3NodesCommandHelp(t *testing.T) { - // Test that help doesn't panic - assert.NotPanics(t, func() { - scriptDemo3Cmd.SetArgs([]string{"--help"}) - scriptDemo3Cmd.Execute() - }) -} - -func TestCleanupCommandHelp(t *testing.T) { - // Test that help doesn't panic - assert.NotPanics(t, func() { - scriptCleanupCmd.SetArgs([]string{"--help"}) - scriptCleanupCmd.Execute() - }) -} - -func TestCleanupWithPermissionError(t *testing.T) { - // Skip this test on systems where we can't create permission-restricted directories - if os.Getuid() == 0 { - t.Skip("Skipping permission test when running as root") - } - - // Create a temporary directory - tempDir, err := os.MkdirTemp("", "pickbox_perm_test_") - require.NoError(t, err) - defer os.RemoveAll(tempDir) - - // Create a subdirectory - subDir := filepath.Join(tempDir, "subdir") - err = os.MkdirAll(subDir, 0755) - require.NoError(t, err) - - // Make parent directory read-only (this may not work on all systems) - err = os.Chmod(tempDir, 0444) - if err != nil { - t.Skip("Cannot change directory permissions on this system") - } - defer os.Chmod(tempDir, 0755) // Restore permissions for cleanup - - // Try to cleanup - this should handle permission errors gracefully - err = cleanup(subDir) - // The result depends on 
the system - some systems allow deletion despite read-only parent - // We just ensure it doesn't panic - assert.NotNil(t, err) // Might be nil or an error, both are valid -} - -func TestRunDemo3NodesDataDirFlag(t *testing.T) { - // Test that data-dir flag is properly handled - tempDir, err := os.MkdirTemp("", "pickbox_datadir_test_") - require.NoError(t, err) - defer os.RemoveAll(tempDir) - - // Create a test command with custom data directory - testCmd := &cobra.Command{Use: "test"} - testCmd.Flags().String("data-dir", tempDir, "test data dir") - - // This should fail at the binary check, but first it should process the data-dir flag - err = runDemo3Nodes(testCmd, []string{}) - assert.Error(t, err) - assert.Contains(t, err.Error(), "pickbox binary not found") - - // The temp directory should have been cleaned up during the process - // (cleanup is called before attempting to start nodes) -} - -func TestScriptCommandStructure(t *testing.T) { - // Test command structure - assert.NotEmpty(t, scriptCmd.Use) - assert.NotEmpty(t, scriptCmd.Short) - assert.NotEmpty(t, scriptCmd.Long) - - // Test subcommands structure - assert.NotEmpty(t, scriptDemo3Cmd.Use) - assert.NotEmpty(t, scriptDemo3Cmd.Short) - assert.NotEmpty(t, scriptDemo3Cmd.Long) - assert.NotNil(t, scriptDemo3Cmd.RunE) - - assert.NotEmpty(t, scriptCleanupCmd.Use) - assert.NotEmpty(t, scriptCleanupCmd.Short) - assert.NotEmpty(t, scriptCleanupCmd.Long) - assert.NotNil(t, scriptCleanupCmd.RunE) -} - -func TestCleanupEmptyPath(t *testing.T) { - // Test cleanup with empty path - err := cleanup("") - assert.NoError(t, err, "cleanup with empty path should not error") -} - -func TestCleanupRelativePath(t *testing.T) { - // Test cleanup with relative path - tempDir, err := os.MkdirTemp("", "pickbox_rel_test_") - require.NoError(t, err) - - // Change to temp directory and create a relative path - originalWd, err := os.Getwd() - require.NoError(t, err) - defer os.Chdir(originalWd) - - parentDir := filepath.Dir(tempDir) - 
err = os.Chdir(parentDir) - require.NoError(t, err) - - relPath := filepath.Base(tempDir) - - // Test cleanup with relative path - err = cleanup(relPath) - assert.NoError(t, err) - - // Verify directory was removed - assert.NoFileExists(t, filepath.Join(parentDir, relPath)) -} - -func TestPortCalculation(t *testing.T) { - // Test the port calculation logic in startNodeInBackground - // Monitor port should be admin port + 1 - // Dashboard port should be admin port + 2 - - testCases := []struct { - adminPort int - expectedMonitor int - expectedDashboard int - }{ - {9001, 9002, 9003}, - {9002, 9003, 9004}, - {9003, 9004, 9005}, - } - - for _, tc := range testCases { - t.Run("admin_port_"+string(rune(tc.adminPort)), func(t *testing.T) { - // We can't test the actual command execution, but we can verify the function - // tries to use the correct ports by checking the error message contains the binary path issue - err := startNodeInBackground("test", 8000, tc.adminPort, "", false) - assert.Error(t, err) - assert.Contains(t, err.Error(), "pickbox binary not found") - }) - } -} - -func TestStringFormatting(t *testing.T) { - // Test that string formatting in the demo function works correctly - // This is testing the console output formatting logic - - tests := []struct { - name string - nodeID string - port int - adminPort int - }{ - {"node1", "node1", 8001, 9001}, - {"node2", "node2", 8002, 9002}, - {"node3", "node3", 8003, 9003}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Test that the node ID is valid for string formatting - assert.NotEmpty(t, tt.nodeID) - assert.NotContains(t, tt.nodeID, " ", "Node ID should not contain spaces") - assert.True(t, tt.port > 0, "Port should be positive") - assert.True(t, tt.adminPort > 0, "Admin port should be positive") - assert.NotEqual(t, tt.port, tt.adminPort, "Port and admin port should be different") - }) - } -} diff --git a/codecov.yml b/codecov.yml index 79ff718..fe8a464 100644 --- a/codecov.yml 
+++ b/codecov.yml @@ -6,19 +6,19 @@ codecov: coverage: precision: 2 round: down - range: "45...100" + range: "60...100" status: project: default: - target: 45% + target: 65% threshold: 1% if_no_uploads: error if_not_found: success if_ci_failed: error patch: default: - target: 25% + target: 60% threshold: 2% ignore: diff --git a/demo_n_nodes.sh b/demo_n_nodes.sh index 3445006..494dd85 100644 --- a/demo_n_nodes.sh +++ b/demo_n_nodes.sh @@ -37,8 +37,7 @@ VERBOSE=false QUICK_DEMO=false INTERACTIVE=false CLEANUP_FIRST=false -BINARY="./bin/pickbox" -BINARY_ARGS="node multi" +BINARY="cmd/multi_replication/main.go" show_help() { echo "Usage: $0 [OPTIONS]" diff --git a/examples/cluster-configs/10-node-high-ports.conf b/examples/cluster-configs/10-node-high-ports.conf index 51a6c75..ba509f9 100644 --- a/examples/cluster-configs/10-node-high-ports.conf +++ b/examples/cluster-configs/10-node-high-ports.conf @@ -1,5 +1,6 @@ -# 10-Node High Ports Configuration -# Uses higher port numbers to avoid conflicts with standard services +# Pickbox 10-Node Cluster Configuration (High Ports) +# Use with: ./scripts/cluster_manager.sh start -c 10-node-high-ports.conf +# This configuration uses higher port numbers to avoid conflicts with other services NODE_COUNT=10 BASE_PORT=18001 @@ -8,8 +9,7 @@ MONITOR_BASE_PORT=16001 DASHBOARD_PORT=18080 HOST=127.0.0.1 DATA_DIR=data -BINARY=./bin/pickbox -BINARY_ARGS="node multi" +BINARY=cmd/multi_replication/main.go # Port assignments: # node1: Raft=18001, Admin=19001, Monitor=16001 diff --git a/examples/cluster-configs/5-node-cluster.conf b/examples/cluster-configs/5-node-cluster.conf index b4eb74c..922788a 100644 --- a/examples/cluster-configs/5-node-cluster.conf +++ b/examples/cluster-configs/5-node-cluster.conf @@ -1,32 +1,39 @@ -# 5-Node Cluster Configuration -# This configuration creates a 5-node cluster with standard port allocation +# Pickbox 5-Node Cluster Configuration +# Use with: ./scripts/cluster_manager.sh start -c 5-node-cluster.conf -# 
Cluster size +# Number of nodes in the cluster NODE_COUNT=5 -# Port configuration +# Base port for Raft communication (nodes will use BASE_PORT, BASE_PORT+1, etc.) BASE_PORT=8001 + +# Base admin port for cluster administration (admin ports will be ADMIN_BASE_PORT, ADMIN_BASE_PORT+1, etc.) ADMIN_BASE_PORT=9001 + +# Base monitoring port for metrics and health checks MONITOR_BASE_PORT=6001 + +# Dashboard port (shared across all nodes) DASHBOARD_PORT=8080 -# Network configuration +# Host address for all nodes HOST=127.0.0.1 -# Storage configuration +# Data directory base path DATA_DIR=data -# Binary configuration -BINARY=./bin/pickbox -BINARY_ARGS="node multi" +# Binary path for the multi-replication application +BINARY=cmd/multi_replication/main.go -# Node-specific data directories will be created as: -# data/node1, data/node2, data/node3, data/node4, data/node5 - -# Port assignments will be: +# Port assignments for this configuration: # node1: Raft=8001, Admin=9001, Monitor=6001 # node2: Raft=8002, Admin=9002, Monitor=6002 # node3: Raft=8003, Admin=9003, Monitor=6003 # node4: Raft=8004, Admin=9004, Monitor=6004 # node5: Raft=8005, Admin=9005, Monitor=6005 -# Dashboard: 8080 (shared) \ No newline at end of file + +# Usage examples: +# Start cluster: ./scripts/cluster_manager.sh start -c examples/cluster-configs/5-node-cluster.conf +# Stop cluster: ./scripts/cluster_manager.sh stop -c examples/cluster-configs/5-node-cluster.conf +# Check status: ./scripts/cluster_manager.sh status -c examples/cluster-configs/5-node-cluster.conf +# Test cluster: ./scripts/tests/test_n_replication.sh -n 5 -p 8001 -a 9001 \ No newline at end of file diff --git a/examples/cluster-configs/7-node-cluster.conf b/examples/cluster-configs/7-node-cluster.conf index 1a27001..19a6685 100644 --- a/examples/cluster-configs/7-node-cluster.conf +++ b/examples/cluster-configs/7-node-cluster.conf @@ -1,4 +1,6 @@ -# 7-Node Cluster Configuration +# Pickbox 7-Node Cluster Configuration +# Use with: 
./scripts/cluster_manager.sh start -c 7-node-cluster.conf + NODE_COUNT=7 BASE_PORT=8001 ADMIN_BASE_PORT=9001 @@ -6,8 +8,7 @@ MONITOR_BASE_PORT=6001 DASHBOARD_PORT=8080 HOST=127.0.0.1 DATA_DIR=data -BINARY=./bin/pickbox -BINARY_ARGS="node multi" +BINARY=cmd/multi_replication/main.go # Port assignments: # node1: Raft=8001, Admin=9001, Monitor=6001 diff --git a/go.mod b/go.mod index 6658275..6eebf76 100644 --- a/go.mod +++ b/go.mod @@ -1,4 +1,4 @@ -module github.com/addityasingh/pickbox +module github.com/aditya/pickbox go 1.21 @@ -7,7 +7,6 @@ require ( github.com/hashicorp/raft v1.6.1 github.com/hashicorp/raft-boltdb v0.0.0-20230125174641-2a8082862702 github.com/sirupsen/logrus v1.9.3 - github.com/spf13/cobra v1.9.1 github.com/stretchr/testify v1.10.0 ) @@ -21,11 +20,10 @@ require ( github.com/hashicorp/go-msgpack v0.5.5 // indirect github.com/hashicorp/go-msgpack/v2 v2.1.1 // indirect github.com/hashicorp/golang-lru v0.5.4 // indirect - github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/mattn/go-colorable v0.1.12 // indirect github.com/mattn/go-isatty v0.0.14 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/spf13/pflag v1.0.6 // indirect + github.com/stretchr/objx v0.5.2 // indirect golang.org/x/sys v0.13.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 0669b1d..92150d9 100644 --- a/go.sum +++ b/go.sum @@ -16,7 +16,6 @@ github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx2 github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp5jckzBHf4XRpQvBOLI+I= -github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -58,8 +57,6 @@ github.com/hashicorp/raft v1.6.1 h1:v/jm5fcYHvVkL0akByAp+IDdDSzCNCGhdO6VdB56HIM= github.com/hashicorp/raft v1.6.1/go.mod h1:N1sKh6Vn47mrWvEArQgILTyng8GoDRNYlgKyK7PMjs0= github.com/hashicorp/raft-boltdb v0.0.0-20230125174641-2a8082862702 h1:RLKEcCuKcZ+qp2VlaaZsYZfLOmIiuJNpEi48Rl8u9cQ= github.com/hashicorp/raft-boltdb v0.0.0-20230125174641-2a8082862702/go.mod h1:nTakvJ4XYq45UXtn0DbwR4aU9ZdjlnIenpbs6Cd+FM0= -github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= -github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= @@ -103,17 +100,14 @@ github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R github.com/prometheus/procfs v0.0.0-20181204211112-1dc9a6cbc91a/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= -github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= 
-github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= -github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= -github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= diff --git a/pkg/admin/server.go b/pkg/admin/server.go index e5b9629..d588c29 100644 --- a/pkg/admin/server.go +++ b/pkg/admin/server.go @@ -198,31 +198,3 @@ func sendForwardCommand(adminAddr string, cmd Command) error { return nil } - -// RequestJoinCluster requests to join a cluster via the admin API -func RequestJoinCluster(adminAddr, nodeID, nodeAddr string) error { - conn, err := net.DialTimeout("tcp", adminAddr, 5*time.Second) - if err != nil { - return fmt.Errorf("connecting to admin server: %w", err) - } - defer conn.Close() - - message := fmt.Sprintf("ADD_VOTER %s %s", nodeID, nodeAddr) - if _, err := conn.Write([]byte(message)); err != nil { - return fmt.Errorf("sending join request: %w", err) - } - - // Read response - buffer := make([]byte, 1024) - n, err := conn.Read(buffer) - if err != nil { - return fmt.Errorf("reading response: %w", err) - } - - response := strings.TrimSpace(string(buffer[:n])) - if response != "OK" { - return fmt.Errorf("join request failed: %s", response) - } - - return nil -} diff --git a/scripts/cleanup_replication.sh b/scripts/cleanup_replication.sh index 6d9f908..5f0de3e 100755 
--- a/scripts/cleanup_replication.sh +++ b/scripts/cleanup_replication.sh @@ -4,8 +4,6 @@ echo "🧹 Performing thorough cleanup..." # Kill all replication-related processes echo "Killing replication processes..." -pkill -f "live_replication" 2>/dev/null || true -pkill -f "cmd/live_replication" 2>/dev/null || true pkill -f "raft_demo" 2>/dev/null || true pkill -f "cmd/raft_demo" 2>/dev/null || true diff --git a/scripts/cluster_manager.sh b/scripts/cluster_manager.sh index 2d36dd9..b764a42 100755 --- a/scripts/cluster_manager.sh +++ b/scripts/cluster_manager.sh @@ -13,7 +13,7 @@ DEFAULT_MONITOR_BASE_PORT=6001 DEFAULT_DASHBOARD_PORT=8080 DEFAULT_HOST="127.0.0.1" DEFAULT_DATA_DIR="data" -DEFAULT_BINARY="./bin/pickbox" +DEFAULT_BINARY="cmd/multi_replication/main.go" # Colors for output RED='\033[0;31m' @@ -84,8 +84,7 @@ CONFIGURATION FILE FORMAT (cluster.conf): DASHBOARD_PORT=8080 HOST=127.0.0.1 DATA_DIR=data - BINARY=./bin/pickbox -BINARY_ARGS="node multi" + BINARY=cmd/multi_replication/main.go EOF } @@ -147,7 +146,8 @@ cleanup_cluster() { # Kill processes local process_patterns=( - "pickbox" + "multi_replication" + "cmd/multi_replication" ) for pattern in "${process_patterns[@]}"; do diff --git a/scripts/run_live_replication.sh b/scripts/run_live_replication.sh deleted file mode 100755 index e161073..0000000 --- a/scripts/run_live_replication.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Clean up any existing processes and data -echo "🧹 Cleaning up existing processes and data..." -pkill -f "live_replication" 2>/dev/null || true -pkill -f "cmd/live_replication" 2>/dev/null || true -sleep 2 -rm -rf data/node{1,2,3} - -echo "🚀 Starting Live Replication Demo" -echo "=================================" -echo "" - -# Start node1 (leader) -echo "Starting node1 (leader) with file watching..." -go run cmd/live_replication/main.go -node node1 -port 8001 & -NODE1_PID=$! -sleep 4 - -# Start node2 -echo "Starting node2..." 
-go run cmd/live_replication/main.go -node node2 -port 8002 -join 127.0.0.1:8001 & -NODE2_PID=$! -sleep 2 - -# Start node3 -echo "Starting node3..." -go run cmd/live_replication/main.go -node node3 -port 8003 -join 127.0.0.1:8001 & -NODE3_PID=$! -sleep 2 - -# Add nodes to cluster -echo "🔗 Adding nodes to cluster..." -if command -v nc &> /dev/null; then - echo "ADD_VOTER node2 127.0.0.1:8002" | nc 127.0.0.1 9001 - sleep 1 - echo "ADD_VOTER node3 127.0.0.1:8003" | nc 127.0.0.1 9001 - sleep 2 -else - echo "⚠️ netcat not available, cluster will run in single-node mode" -fi - -echo "" -echo "✅ Live replication cluster is running!" -echo "======================================" -echo "" -echo "🧪 Testing Instructions:" -echo "1. Open another terminal" -echo "2. Edit files in data/node1/ (the leader)" -echo "3. Watch them automatically replicate to data/node2/ and data/node3/" -echo "" -echo "Example commands to try:" -echo " echo 'Hello World!' > data/node1/hello.txt" -echo " echo 'Line 2' >> data/node1/hello.txt" -echo " cp /etc/hosts data/node1/hosts_copy.txt" -echo "" -echo "Then check replication with:" -echo " cat data/node*/hello.txt" -echo " cat data/node*/hosts_copy.txt" -echo "" -echo "📁 Node directories:" -echo " - data/node1/ (leader - edit files here)" -echo " - data/node2/ (follower - files appear here)" -echo " - data/node3/ (follower - files appear here)" -echo "" -echo "🛑 Press Ctrl+C to stop all nodes" - -# Function to show live status -show_status() { - echo "" - echo "📊 Current Status:" - echo "==================" - for node in node1 node2 node3; do - if [ -d "data/$node" ]; then - file_count=$(find data/$node -name "*.txt" 2>/dev/null | wc -l) - echo " $node: $file_count text files" - fi - done -} - -# Cleanup function -cleanup() { - echo "" - echo "🛑 Stopping all nodes..." - kill $NODE1_PID $NODE2_PID $NODE3_PID 2>/dev/null - wait - echo "✅ All nodes stopped." 
- show_status -} - -trap cleanup EXIT - -# Show periodic status updates -while true; do - sleep 10 - show_status -done \ No newline at end of file diff --git a/scripts/run_multi_replication.sh b/scripts/run_multi_replication.sh index 8d371fd..d711ef6 100755 --- a/scripts/run_multi_replication.sh +++ b/scripts/run_multi_replication.sh @@ -10,19 +10,19 @@ echo "" # Start node1 (leader) echo "Starting node1 (leader) with multi-directional file watching..." -./bin/pickbox node multi --node-id node1 --port 8001 --bootstrap & +go run cmd/multi_replication/main.go -node node1 -port 8001 & NODE1_PID=$! sleep 4 # Start node2 echo "Starting node2 with multi-directional file watching..." -./bin/pickbox node multi --node-id node2 --port 8002 --join 127.0.0.1:8001 & +go run cmd/multi_replication/main.go -node node2 -port 8002 -join 127.0.0.1:8001 & NODE2_PID=$! sleep 2 # Start node3 echo "Starting node3 with multi-directional file watching..." -./bin/pickbox node multi --node-id node3 --port 8003 --join 127.0.0.1:8001 & +go run cmd/multi_replication/main.go -node node3 -port 8003 -join 127.0.0.1:8001 & NODE3_PID=$! sleep 2 diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index bc15880..3bfff71 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -153,7 +153,6 @@ cleanup() { # Kill any running processes pkill -f "multi_replication" 2>/dev/null || true - pkill -f "live_replication" 2>/dev/null || true pkill -f "go run.*replication" 2>/dev/null || true pkill -f "cluster_manager" 2>/dev/null || true @@ -210,20 +209,11 @@ cd cmd/multi_replication go build -o ../../bin/multi_replication . cd "$PROJECT_ROOT" -cd cmd/live_replication -go build -o ../../bin/live_replication . -cd "$PROJECT_ROOT" - if [ ! -f "bin/multi_replication" ]; then echo -e "${RED}Failed to build multi_replication binary${NC}" exit 1 fi -if [ ! 
-f "bin/live_replication" ]; then - echo -e "${RED}Failed to build live_replication binary${NC}" - exit 1 -fi - echo -e "${GREEN}✅ Build successful${NC}" # Make scripts executable diff --git a/scripts/tests/README.md b/scripts/tests/README.md index 56b89eb..85e3946 100644 --- a/scripts/tests/README.md +++ b/scripts/tests/README.md @@ -9,10 +9,7 @@ This directory contains automated test scripts for the distributed file storage - **Usage**: `./test_replication.sh` - **What it tests**: Runs basic, live, and multi-directional replication tests -### `test_live_replication.sh` -- **Purpose**: Tests live file watching and replication -- **Usage**: `./test_live_replication.sh` -- **What it tests**: File creation, modification, and real-time replication from leader to followers +# [REMOVED] test_live_replication.sh functionality has been deleted ### `test_multi_replication.sh` - **Purpose**: Tests multi-directional replication capabilities @@ -33,7 +30,7 @@ cd scripts/tests To run individual tests: ```bash cd scripts/tests -./test_live_replication.sh +# [REMOVED] ./test_live_replication.sh ./test_multi_replication.sh ``` diff --git a/scripts/tests/test_live_replication.sh b/scripts/tests/test_live_replication.sh deleted file mode 100755 index 93f81e5..0000000 --- a/scripts/tests/test_live_replication.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -echo "🧪 Testing Live Replication" -echo "============================" - -# Clean up any existing processes first -echo "Cleaning up any existing processes..." -chmod +x ../cleanup_replication.sh -../cleanup_replication.sh - -# Start the live replication in background -echo "Starting live replication cluster..." -../run_live_replication.sh > /tmp/live_replication.log 2>&1 & -DEMO_PID=$! - -# Wait for cluster to start -echo "Waiting for cluster to initialize..." -sleep 12 - -# Test file operations -echo "" -echo "🔧 Testing file operations..." - -# Test 1: Create a file -echo "Test 1: Creating test1.txt..." 
-echo "Hello from live replication test!" > data/node1/test1.txt -sleep 4 - -# Check replication -echo "Checking replication..." -for node in node1 node2 node3; do - if [ -f "data/$node/test1.txt" ]; then - echo "✅ $node: $(cat data/$node/test1.txt)" - else - echo "❌ $node: File not found!" - fi -done - -echo "" - -# Test 2: Modify the file -echo "Test 2: Modifying test1.txt..." -echo "Modified content!" >> data/node1/test1.txt -sleep 4 - -# Check replication -echo "Checking replication after modification..." -for node in node1 node2 node3; do - if [ -f "data/$node/test1.txt" ]; then - lines=$(wc -l < "data/$node/test1.txt") - echo "✅ $node: $lines lines" - else - echo "❌ $node: File not found!" - fi -done - -echo "" - -# Test 3: Create another file -echo "Test 3: Creating test2.txt..." -echo "Another test file" > data/node1/test2.txt -sleep 4 - -# Check file count -echo "Checking total file count..." -for node in node1 node2 node3; do - if [ -d "data/$node" ]; then - file_count=$(find data/$node -name "*.txt" ! -name "welcome.txt" 2>/dev/null | wc -l) - echo "✅ $node: $file_count test files" - else - echo "❌ $node: Directory not found!" - fi -done - -echo "" -echo "📊 Test Results Summary:" -echo "========================" - -# Final verification -all_good=true - -for node in node2 node3; do # Check followers - if [ ! -f "data/$node/test1.txt" ] || [ ! -f "data/$node/test2.txt" ]; then - echo "❌ $node: Missing replicated files" - all_good=false - else - # Check content matches - if cmp -s "data/node1/test1.txt" "data/$node/test1.txt" && cmp -s "data/node1/test2.txt" "data/$node/test2.txt"; then - echo "✅ $node: All files replicated correctly" - else - echo "❌ $node: File content mismatch" - all_good=false - fi - fi -done - -# Cleanup -echo "" -echo "🧹 Cleaning up..." -kill $DEMO_PID 2>/dev/null -wait - -if [ "$all_good" = true ]; then - echo "🎉 SUCCESS: Live replication is working perfectly!" 
- echo "" - echo "📋 What was tested:" - echo " ✅ File creation replication" - echo " ✅ File modification replication" - echo " ✅ Multiple file replication" - echo " ✅ Content consistency across nodes" - echo "" - echo "🎯 Try the interactive demo with: ./scripts/run_live_replication.sh" -else - echo "❌ FAILURE: Some replication tests failed" - echo "Check the logs in /tmp/live_replication.log" -fi \ No newline at end of file diff --git a/scripts/tests/test_n_replication.sh b/scripts/tests/test_n_replication.sh index 627820a..d91aa3d 100755 --- a/scripts/tests/test_n_replication.sh +++ b/scripts/tests/test_n_replication.sh @@ -11,7 +11,7 @@ DEFAULT_BASE_PORT=8001 DEFAULT_ADMIN_BASE_PORT=9001 DEFAULT_HOST="127.0.0.1" DEFAULT_DATA_DIR="data" -DEFAULT_BINARY="./bin/pickbox" +DEFAULT_BINARY="cmd/multi_replication/main.go" # Colors for output RED='\033[0;31m' diff --git a/scripts/tests/test_replication.sh b/scripts/tests/test_replication.sh index f827c4f..75f2aa4 100755 --- a/scripts/tests/test_replication.sh +++ b/scripts/tests/test_replication.sh @@ -11,7 +11,7 @@ echo "------------------------------------" echo "" echo "2. Testing live replication..." echo "------------------------------" -./test_live_replication.sh + echo "" echo "3. Testing multi-directional replication..." 
diff --git a/test/README.md b/test/README.md index f56e524..649bda6 100644 --- a/test/README.md +++ b/test/README.md @@ -12,7 +12,7 @@ This directory contains comprehensive tests for the Pickbox distributed file sto - **Vector Clock Tests**: Test distributed conflict resolution mechanisms - **Raft Manager Tests**: Test Raft consensus and FSM operations -#### Multi-Replication Tests (`cmd/pickbox/multi_replication_test.go`) +#### Multi-Replication Tests (`cmd/multi_replication/main_test.go`) - **FSM Tests**: Test file system state machine operations - **Command Tests**: Test command serialization/deserialization - **Hash Function Tests**: Test content deduplication mechanisms @@ -54,14 +54,14 @@ Performance testing for critical operations: ```bash # Unit tests only go test -v ./pkg/storage -go test -v ./cmd/pickbox +go test -v ./cmd/multi_replication # Integration tests only cd test && go test -v . # Benchmarks only go test -bench=. ./pkg/storage -go test -bench=. ./cmd/pickbox +go test -bench=. ./cmd/multi_replication ``` ### Coverage Reports @@ -131,8 +131,8 @@ go tool cover -func=coverage.out 3. **Build Issues** ```bash # Rebuild binary - cd cmd/pickbox - go build -o ../../bin/pickbox . + cd cmd/multi_replication + go build -o ../../bin/multi_replication . ``` ### Verbose Logging @@ -168,7 +168,7 @@ export PICKBOX_DEBUG=true cd pkg/storage && go test -bench=. -benchmem # Multi-replication benchmarks -cd cmd/pickbox && go test -bench=. -benchmem +cd cmd/multi_replication && go test -bench=. 
-benchmem # Custom benchmark runs go test -bench=BenchmarkHashContent -count=5 -benchtime=10s diff --git a/test/integration_test.go b/test/integration_test.go index 2f5805e..5666680 100644 --- a/test/integration_test.go +++ b/test/integration_test.go @@ -54,11 +54,11 @@ func cleanupTestEnvironment() { // startNode starts a multi-replication node in the background func startNode(t *testing.T, nodeID string, dataDir string, raftPort, adminPort int) *exec.Cmd { cmd := exec.Command( - "../bin/pickbox", "node", "multi", - "--node-id", nodeID, - "--data-dir", dataDir, - "--port", fmt.Sprintf("%d", raftPort), - "--admin-port", fmt.Sprintf("%d", adminPort), + "go", "run", "../cmd/multi_replication/main.go", + "-node", nodeID, + "-dir", dataDir, + "-port", fmt.Sprintf("%d", raftPort), + "-admin", fmt.Sprintf("%d", adminPort), ) // Set up logging for debugging diff --git a/test/n_node_failure_test.go b/test/n_node_failure_test.go index 5b9e0da..9140c1e 100644 --- a/test/n_node_failure_test.go +++ b/test/n_node_failure_test.go @@ -74,12 +74,12 @@ func (suite *failureTestSuite) restartNode(t *testing.T, nodeNum int) { joinAddr := "127.0.0.1:8001" // Always join to node1 cmd := exec.Command( - "../bin/pickbox", "node", "multi", - "--node-id", nodeID, - "--port", fmt.Sprintf("%d", port), - "--admin-port", fmt.Sprintf("%d", adminPort), - "--data-dir", dataDir, - "--join", joinAddr, + "go", "run", "../cmd/multi_replication/main.go", + "-node", nodeID, + "-port", fmt.Sprintf("%d", port), + "-admin-port", fmt.Sprintf("%d", adminPort), + "-data-dir", dataDir, + "-join", joinAddr, ) // Start in background diff --git a/test_runner.sh b/test_runner.sh index 4df73a0..570ab0a 100644 --- a/test_runner.sh +++ b/test_runner.sh @@ -262,7 +262,6 @@ cleanup() { # Kill test processes pkill -f "multi_replication" 2>/dev/null || true - pkill -f "live_replication" 2>/dev/null || true pkill -f "go run.*replication" 2>/dev/null || true pkill -f "cluster_manager" 2>/dev/null || true @@ -390,10 +389,6 
@@ if [ "$DRY_RUN" = false ]; then go build -o ../../bin/multi_replication . || exit 1 cd - > /dev/null - cd cmd/live_replication - go build -o ../../bin/live_replication . || exit 1 - cd - > /dev/null - echo -e "${GREEN}✅ Binaries built successfully${NC}" else echo -e "${YELLOW}[DRY RUN] Would build binaries${NC}"