From c97e4e3601d99bbc862b9cf9d82a52b6d253e59e Mon Sep 17 00:00:00 2001 From: ChanHHOO Date: Mon, 27 Oct 2025 22:02:09 +0900 Subject: [PATCH] feat : Update docs --- README.md | 530 ++++++++++++++++++++------------ src/test/resources/sample.nem12 | 8 + 2 files changed, 341 insertions(+), 197 deletions(-) create mode 100644 src/test/resources/sample.nem12 diff --git a/README.md b/README.md index 539608d..da77236 100644 --- a/README.md +++ b/README.md @@ -1,274 +1,410 @@ # NEM12 Parser -A Kotlin-based parser for NEM12 format energy meter reading files, generating SQL INSERT statements for the `meter_readings` database table. +## 1. Overview -## Features +### 1-1. Project Overview -- **Streaming Processing**: Handles very large files efficiently with constant memory usage -- **State Machine Pattern**: Robust parsing of hierarchical NEM12 file structure -- **Multiple Output Modes**: - - PostgreSQL COPY (fastest, recommended) - - Batch INSERT (standard SQL, compatible) -- **Production-Grade**: Error handling, logging, validation -- **Written in Kotlin**: Concise, type-safe, modern +A production-grade Kotlin-based parser for NEM12 format energy meter reading files. The parser reads NEM12 files, validates their contents, and stores meter readings and parsing failures in a database. -## Quick Start +**Key Features:** +- **Best-Effort Parsing**: Continues processing even when individual records fail validation +- **Dual Failure Handling**: Saves failures to database AND logs them to console in real-time +- **Batch Processing**: Optimized batch inserts for high performance +- **Timezone Conversion**: Automatically converts AEST/AEDT timestamps to UTC +- **Test code**: Wrote test code for server stability. -### Prerequisites +**What it does:** +1. Reads NEM12 format files line-by-line +2. Validates each record against NEM12 specifications +3. Converts meter readings to UTC timezone +4. Stores valid readings in `meter_reading` table +5. 
Stores failed records in `failure_reading` table with detailed error information +6. Logs all failures to console for real-time monitoring -- JDK 11 or higher -- Gradle (or use included wrapper) +### 1-2. Dependencies -### Build +**Runtime Dependencies:** +- **JDK 21**: Java Development Kit +- **Kotlin**: Modern JVM language with null-safety and type inference +- **SQLite**: Embedded database for data storage + +**Development Dependencies:** +- **Gradle**: Build automation tool +- **JUnit**: Testing framework +- **Kotest**: Kotlin-specific assertions +- **Ktlint**: Kotlin code style checker and formatter + +### 1-3. How to Run + +#### Prerequisites +- JDK 21 or higher installed +- No additional software required (SQLite is embedded) + +#### Build the Project ```bash +# Move into the project directory +cd nem12-parser + +# Build the project (runs tests automatically) ./gradlew clean build ``` -This generates: `build/libs/nem12-parser-1.0.0-standalone.jar` - -### Usage +#### Run the Parser +**Basic Usage:** ```bash -# Using PostgreSQL COPY (fastest) -java -jar build/libs/nem12-parser-1.0.0-standalone.jar input.nem12 output.sql +java -jar build/libs/nem12-parser-1.0.0-standalone.jar INPUT_FILE OUTPUT_DB [--batch-size=N] +``` -# Using Batch INSERT -java -jar build/libs/nem12-parser-1.0.0-standalone.jar input.nem12 output.sql --mode=batch +**Examples:** +```bash +# Parse a NEM12 file and store results in output.db +java -jar build/libs/nem12-parser-1.0.0-standalone.jar ./src/test/resources/sample.nem12 output.db -# Custom batch size -java -jar build/libs/nem12-parser-1.0.0-standalone.jar input.nem12 output.sql --mode=batch --batch-size=500 +# Custom batch size (default: 50) +java -jar build/libs/nem12-parser-1.0.0-standalone.jar ./src/test/resources/sample.nem12 output.db --batch-size=500 ``` -### Execute SQL +#### Using Gradle Run Task (Development) ```bash -# PostgreSQL -psql -d your_database -f output.sql +# Run with default test file +./gradlew run + +# Run with custom arguments +./gradlew run --args="input.nem12 
output.db" ``` -## Architecture +#### View Results + +**Query the database:** +```bash +# Open SQLite database +sqlite3 output.db -### Core Components +# View successful meter readings +SELECT * FROM meter_reading LIMIT 10; +# View failed records with reasons +SELECT line_number, failure_reason, nmi, raw_value +FROM failure_reading +ORDER BY line_number; ``` -┌──────────────┐ -│ NEM12 File │ -└──────┬───────┘ - │ - ▼ -┌──────────────────┐ -│ NEM12Parser │ State Machine -│ - ParserState │ - Tracks NMI context -│ - RecordParser │ - Validates structure -└──────┬───────────┘ - │ - ▼ -┌──────────────────┐ -│ SQLGenerator │ Strategy Pattern -│ - CopyCommand │ - PostgreSQL COPY -│ - BatchInsert │ - Standard INSERT -└──────┬───────────┘ - │ - ▼ -┌──────────────┐ -│ SQL Output │ -└──────────────┘ + +#### Run Tests + +```bash +# Run all tests +./gradlew test ``` -### Key Classes +#### Code Quality Checks -#### 1. Data Model +```bash -```kotlin -data class MeterReading( - val nmi: String, - val timestamp: LocalDateTime, - val consumption: BigDecimal -) +# Auto-format code +./gradlew ktlintFormat ``` -#### 2. Parser +#### Output Files -```kotlin -class NEM12Parser(private val sqlGenerator: SQLGenerator) { - fun parse(filePath: Path) { - // Stream-based line-by-line processing - // State machine handles 100/200/300/500/900 records - } -} +After running the parser, you'll find: +- **`.db`**: SQLite database with two tables: + - `meter_reading`: Successfully parsed meter readings + - `failure_reading`: Failed records with error details + +#### Console Output Example + +``` +INFO - Starting to parse file: sample.nem12 +WARN - Parsing failure - Line 15: NEGATIVE_VALUE (NMI: 1234567890, Interval: 5, Time: 2024-01-01T12:00, Raw: '-10.5') +INFO - Successfully parsed 1523 lines +INFO - Parsing completed successfully +Database created: output.db +Failed records: + NEGATIVE_VALUE: 2 + EMPTY_VALUE: 5 + INTERVAL_COUNT_MISMATCH: 1 +Failed records database: output.db ``` -#### 3. 
SQL Generators +--- -```kotlin -interface SQLGenerator : Closeable { - fun addReading(reading: MeterReading) - fun flush() -} +## 2. Architecture + +### 2-1. Project Architecture Overview -class CopyCommandGenerator(outputPath: Path) : SQLGenerator -class BatchInsertGenerator(outputPath: Path, batchSize: Int) : SQLGenerator +The NEM12 Parser follows a **Layered Architecture** pattern with clear separation of concerns across three main layers: + +``` +┌─────────────────────────────────────────────────────────┐ +│ Main (CLI) │ +│ - Command-line argument parsing │ +│ - Dependency injection setup │ +└────────────────────┬────────────────────────────────────┘ + │ + ┌───────────┴───────────┐ + ▼ ▼ +┌──────────────────┐ ┌──────────────────┐ +│ Failure Handler │ │ Parser Service │ +│ (Composite) │◄───│ (NEM12Parser) │ +├──────────────────┤ ├──────────────────┤ +│ - Database │ │ - File reading │ +│ - Logging │ │ - State machine │ +│ - (Extensible) │ │ - Validation │ +└──────────────────┘ └────────┬─────────┘ + │ + ┌────────────┴────────────┐ + ▼ ▼ + ┌──────────────────┐ ┌──────────────────┐ + │ Record Parser │ │ Repository │ + │ Service │ │ (Data Access) │ + ├──────────────────┤ ├──────────────────┤ + │ - Interval data │ │ - Meter reading │ + │ - Validation │ │ - Failure record │ + │ - Failure notify │ │ - Batch insert │ + └──────────────────┘ └──────────────────┘ + │ + ▼ + ┌──────────────────┐ + │ SQLite Database │ + │ - meter_reading │ + │ - failure_reading│ + └──────────────────┘ ``` -## Database Schema +### 2-2. 
Component Descriptions + +#### **Layer 1: Handler (Entry Point)** + +**Main.kt** - Application entry point +- Parses command-line arguments +- Creates and wires dependencies +- Orchestrates the parsing workflow +- Displays statistics and results + +#### **Layer 2: Service (Business Logic)** + +**NEM12ParserService** - Main parsing orchestration +- Reads NEM12 file line-by-line +- Maintains parser state +- Delegates interval data parsing to RecordParserService +- Saves valid readings to repository + +**RecordParserService** - Interval data parsing and validation +- Parses 300 (interval data) records +- Checks interval count against expected count +- Notifies FailureHandler for invalid records +- Returns list of valid MeterReading objects + + +#### **Layer 3: Handler (Failure Processing)** + +**FailureHandler (Interface)** - Defines failure handling contract + +**Implementations:** +1. **DatabaseFailureHandler** - Persists failures to SQLite +2. **LoggingFailureHandler** - Logs failures to console +3. **CompositeFailureHandler** - Combines multiple handlers (Composite Pattern) + +**Benefits:** +- Easy to add new handlers (e.g., EmailHandler, MetricsHandler) +- Separation of concerns +- Testability through interfaces + +#### **Layer 4: Repository (Data Access)** + +**BaseSQLiteRepository** +- Handles batch processing +- **Provides timezone conversion utility (AEST → UTC)** +- Implements common operations + +**Concrete Implementations:** +1. **MeterReadingRepositoryImpl** + - Stores valid meter readings + - Generates UUID for each record + - Tracks total inserted count + +2. **FailureReadingsRepositoryImpl** + - Stores failed parsing records + - Tracks statistics by failure reason + - Supports nullable fields (timestamp, interval index) + +### 2-3. 
Design Patterns Used + +| Pattern | Usage | Benefit | +|---------|-------|---------| +| **Layered Architecture** | Handler-Service-Repository | Separation of concerns, testability | +| **Template Method** | BaseSQLiteRepository | Code reuse, consistent behavior | +| **Composite** | CompositeFailureHandler | Combine multiple handlers flexibly | +| **Strategy** | FailureHandler implementations | Swap handling strategies at runtime | +| **State Machine** | ParserState | Track NEM12 file structure hierarchy | +| **Dependency Injection** | Constructor injection | Loose coupling, testability | + +### 2-4. Database Schema + +**meter_reading table:** ```sql -create table meter_readings ( - id uuid default gen_random_uuid() not null, - nmi varchar(10) not null, - timestamp timestamp not null, - consumption numeric not null, - constraint meter_readings_pk primary key (id), - constraint meter_readings_unique_consumption unique (nmi, timestamp) +CREATE TABLE meter_reading ( + id TEXT PRIMARY KEY, -- UUID + nmi VARCHAR(10) NOT NULL, -- Meter identifier + timestamp TIMESTAMP NOT NULL, -- UTC timestamp + consumption NUMERIC NOT NULL, -- Energy consumption (15.4 format) + UNIQUE(nmi, timestamp) -- Prevent duplicates ); ``` -## NEM12 Format Overview +**failure_reading table:** +```sql +CREATE TABLE failure_reading ( + id TEXT PRIMARY KEY, -- UUID + line_number INTEGER NOT NULL, -- Source line in input file + nmi TEXT, -- Meter identifier + interval_index INTEGER, -- Interval position + raw_value TEXT NOT NULL, -- Original invalid value + failure_reason TEXT NOT NULL, -- Reason enum + timestamp TIMESTAMP -- Timestamp +); +``` + +--- -| Record Type | Code | Description | -|-------------|------|-------------| -| Header | 100 | File metadata | -| NMI Data | 200 | Meter identifier and interval settings | -| Interval Data | 300 | Actual consumption readings (48 values for 30-min intervals) | -| NMI End | 500 | End of meter data block | -| File End | 900 | End of file | +## 3. 
Major Decisions -### Example Transformation +### 3-1. File Reading Strategy -**Input (NEM12):** +**Decision: Streaming (line-by-line) approach** + +```kotlin +// Using BufferedReader with lineSequence() +cmd.inputPath.bufferedReader().use { reader -> + reader.lineSequence().forEach { line -> + parseLine(line.trim()) + } +} ``` -200,NEM1201009,E1E2,1,E1,N1,01009,kWh,30,20050610 -300,20050301,0.461,0.810,... + +**Why:** +- No need to load entire file into memory + +### 3-2. Best-Effort Parsing + +**Decision: Continue parsing even when individual records fail** + +```kotlin +failureHandler.use { + for (i in 0 until expectedIntervals) { + if (!isValid(value)) { + failureHandler.handleFailure(record) // Log and continue + continue + } + readings.add(validReading) + } +} ``` -**Output (PostgreSQL COPY):** -```sql -COPY meter_readings (nmi, timestamp, consumption) FROM STDIN WITH (FORMAT CSV); -NEM1201009,2005-03-01 00:00:00,0.461 -NEM1201009,2005-03-01 00:30:00,0.810 -\. +**Why:** +- **Maximize data extraction** from partially corrupted files +- Better user experience (get some data vs. nothing) +- Detailed failure tracking for debugging + +**Alternative considered:** +- Fail-fast approach (stop on first error) +- Rejected because: Real-world files often have isolated errors + +### 3-3. Batch Insert Optimization + +**Decision: Buffer records and insert in batches** + +```kotlin +fun save(entity: T) { + insertStatement.addBatch() + batchCount++ + + if (batchCount >= batchSize) { + executeBatch() // Execute when batch is full + } +} ``` -## Design Decisions +**Why: Performance** +- Reduces database I/O operations +- Efficient use of database connection + +### 3-4. 
Timezone Conversion (AEST → UTC) + +**Decision: Convert all timestamps to UTC before storage** -### Q1: Technology Rationale +```kotlin +fun aestToUtc(timestamp: LocalDateTime): LocalDateTime { + return timestamp.atZone(AEST) + .withZoneSameInstant(UTC) + .toLocalDateTime() +} +``` -**Kotlin:** -- Modern, concise syntax reduces boilerplate -- Null-safety prevents common runtime errors -- Excellent Java interoperability -- Strong type system with data classes -- Expressive DSL capabilities +**Why:** +- **DST handling**: Automatically handles AEST ↔ AEDT transitions +- **International compatibility**: UTC is the standard for data storage -**Gradle:** -- Kotlin DSL for type-safe build configuration -- Superior dependency management -- Fast incremental builds +**Requirement (from Shishir):** +> Input date timezone is AEST, UTC+10:00, and can be stored in the database as UTC -**PostgreSQL COPY:** -- 2-5x faster than batch INSERT -- Industry standard for bulk loading -- Direct database protocol optimization +### 3-5. Composite Handler Pattern +**Decision: Multiple failure handlers combined via CompositeFailureHandler** -### Q2: Future Improvements -Given more time, I would add: +```kotlin +val databaseHandler = DatabaseFailureHandler(repository) +val loggingHandler = LoggingFailureHandler() +val compositeHandler = CompositeFailureHandler(databaseHandler, loggingHandler) +``` -1. **Parallel Processing**: Chunk-based parallel parsing for multi-core utilization -2. **Progress Reporting**: Real-time progress bar for large files -3. **Data Quality Reports**: Statistics on skipped values, outliers, validation failures -4. **Direct Database Connection**: JDBC-based direct insert with transaction management -5. **Additional Output Formats**: JSON, Parquet, CSV for data analysis workflows -6. **Comprehensive Testing**: Property-based testing, performance benchmarks -7. 
**Resume Capability**: Checkpoint system to resume interrupted processing +**Why:** +- **Flexibility**: Enable/disable handlers independently +- **Extensibility**: Easy to add new handlers (email, metrics, etc.) +- **Single Responsibility**: Each handler does one thing -### Q3: Design Choices Rationale -1. **Streaming vs. Loading Entire File** - - **Choice**: Streaming with BufferedReader - - **Why**: Constant memory usage regardless of file size - - **Tradeoff**: Cannot random access, must process sequentially +--- -2. **State Machine Pattern** - - **Choice**: Explicit state tracking with ParserState - - **Why**: NEM12's hierarchical structure (200→300→500) - - **Tradeoff**: Slightly more complex than linear processing +## 4. How AI Was Used in This Project -3. **Strategy Pattern for SQL Generation** - - **Choice**: Interface with multiple implementations - - **Why**: Flexibility to choose optimal method per use case - - **Tradeoff**: Additional abstraction layer +### 4-1. Design Phase: Architecture Planning -4. **Immutable Data Classes** - - **Choice**: Kotlin data classes with `val` - - **Why**: Thread-safety, functional programming style - - **Tradeoff**: Cannot modify after creation +**Tool: Claude** -5. **Fail-Fast Error Handling** - - **Choice**: Throw ParseException on first error - - **Why**: Ensures data integrity, prevents bad data in DB - - **Tradeoff**: Cannot do partial processing (could add best-effort mode) +Used AI for architectural decision validation before implementation. -## Performance +**Example:** +- Validated Repository and Composite patterns -**Expected Performance (1GB file, ~10M records):** -- Parsing: ~20-30 seconds -- SQL Generation (COPY): ~10-15 seconds -- Memory Usage: <100MB (constant) -- **Total Time: ~30-45 seconds** +### 4-2. 
Initial Code Implementation -**Optimization Techniques:** -- Line-by-line streaming (no full file in memory) -- Batch buffering (reduces I/O operations) -- String operations without regex (faster parsing) -- Direct date construction (avoid DateTimeFormatter overhead) +**Tool: Claude Code** -## Testing +AI assisted with code generation and Kotlin idioms: +- Generated BaseSQLiteRepository structure +- Implemented timezone conversion logic +- Created test scaffolding -Run tests: -```bash -./gradlew test -``` +### 4-3. Automated Code Review -Sample test file included: `src/test/resources/sample.nem12` +**Tool: Claude Bot + GitHub Actions** -## Project Structure +Set up AI-powered code review on pull requests ([Sample](https://github.com/ChanHHOO/flo-energy-tech-assessment/pull/16)). -``` -nem12-parser/ -├── build.gradle.kts -├── src/ -│ ├── main/ -│ │ ├── kotlin/com/flo/nem12/ -│ │ │ ├── Main.kt -│ │ │ ├── model/ -│ │ │ │ ├── MeterReading.kt -│ │ │ │ └── RecordType.kt -│ │ │ ├── parser/ -│ │ │ │ ├── NEM12Parser.kt -│ │ │ │ ├── RecordParser.kt -│ │ │ │ ├── ParserState.kt -│ │ │ │ └── TimestampCalculator.kt -│ │ │ ├── generator/ -│ │ │ │ ├── SQLGenerator.kt -│ │ │ │ ├── CopyCommandGenerator.kt -│ │ │ │ └── BatchInsertGenerator.kt -│ │ │ └── exception/ -│ │ │ └── ParseException.kt -│ │ └── resources/ -│ │ └── logback.xml -│ └── test/ -│ ├── kotlin/com/flo/nem12/ -│ └── resources/ -│ └── sample.nem12 -└── README.md -``` +**Impact:** Instant feedback on, and automated validation of, code quality -## License +### 4-4. NEM12 Format Analysis -This project is for the Flo Energy Tech Assessment. +**Tool: Google NotebookLM** -## Author +Analyzed NEM12 specification documents to extract requirements. -Developed as part of Flo Energy technical assessment, demonstrating production-grade Kotlin development practices. +**Process:** +1. Uploaded NEM12 spec PDFs to NotebookLM +2. Asked questions about record types and validation rules +3. 
Generated summary of key requirements diff --git a/src/test/resources/sample.nem12 b/src/test/resources/sample.nem12 new file mode 100644 index 0000000..d2ce361 --- /dev/null +++ b/src/test/resources/sample.nem12 @@ -0,0 +1,8 @@ +100,NEM12,200506081149,UNITEDDP,NEMMCO +200,NEM1201009,E1E2,1,E1,N1,01009,kWh,30,20050610 +300,20050301,WRONGDATA,0,0,0,0,0,0,0,0,0,0,0,0.461,0.810,0.568,1.234,1.353,1.507,1.344,1.773,0.848,1.271,0.895,1.327,1.013,1.793,0.988,0.985,0.876,0.555,0.760,0.938,0.566,0.512,0.970,0.760,0.731,0.615,0.886,0.531,0.774,0.712,0.598,0.670,0.587,0.657,0.345,0.231,A,,,20050310121004,20050310182204 +300,20050302,0,0,0,0,0,0,0,0,0,0,0,0,0.235,0.567,0.890,1.123,1.345,1.567,1.543,1.234,0.987,1.123,0.876,1.345,1.145,1.173,1.265,0.987,0.678,0.998,0.768,0.954,0.876,0.845,0.932,0.786,0.999,0.879,0.777,0.578,0.709,0.772,0.625,0.653,0.543,0.599,0.432,0.432,A,,,20050310121004,20050310182204 +300,20050303,0,0,0,0,0,0,0,0,0,0,0,0,0.261,0.310,0.678,0.934,1.211,1.134,1.423,1.370,0.988,1.207,0.890,1.320,1.130,1.913,1.180,0.950,0.746,0.635,0.956,0.887,0.560,0.700,0.788,0.668,0.543,0.738,0.802,0.490,0.598,0.809,0.520,0.670,0.570,0.600,0.289,0.321,A,,,20050310121004,20050310182204 +300,20050304,0,0,0,0,0,0,0,0,0,0,0,0,0.335,0.667,0.790,1.023,1.145,1.777,1.563,1.344,1.087,1.453,0.996,1.125,1.435,1.263,1.085,1.487,1.278,0.768,0.878,0.754,0.476,1.045,1.132,0.896,0.879,0.679,0.887,0.784,0.954,0.712,0.599,0.593,0.674,0.799,0.232,0.612,A,,,20050310121004,20050310182204 +500,O,S01009,20050310121004, +900 \ No newline at end of file