b4prog · b4prog · Sep 9, 2025 · Sep 7, 2025 · Sep 7, 2025 · Sep 7, 2025
diff --git a/.coderabbit.yaml b/.coderabbit.yaml
@@ -0,0 +1,28 @@
+language: en
+
+reviews:
+  profile: assertive
+  high_level_summary: true
+  poem: false
+  auto_review:
+    enabled: true
+    drafts: true
+
+  tools:
+    golangci-lint:
+      enabled: true
+    gitleaks:
+      enabled: true
+    yamllint:
+      enabled: true
+    actionlint:
+      enabled: true
+    shellcheck:
+      enabled: true
+    markdownlint:
+      enabled: true
+    osvScanner:
+      enabled: true
+    github-checks:
+      enabled: true
+      timeout_ms: 300000
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -0,0 +1,96 @@
+name: CI
+
+on:
+  push:
+  pull_request:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ci-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-test:
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+
+    # Run from the repo root now that go.mod is at top level
+    defaults:
+      run:
+        shell: bash
+        working-directory: .
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: "1.25.x"
+          check-latest: true
+          cache: true
+          cache-dependency-path: go.sum
+
-      - name: Set up Go
-        uses: actions/setup-go@v5
-        with:
-          go-version: "1.25.x"
-          check-latest: true
-          cache: true
-          cache-dependency-path: go.sum
+      - name: Set up Go
+        uses: actions/setup-go@<commit-sha>
+        with:
+          go-version-file: go.mod
+          check-latest: true
+          cache: true
+          cache-dependency-path: go.sum
-      - name: Set up Go
-        uses: actions/setup-go@v5
-        with:
-          go-version: "1.25.x"
-          check-latest: true
-          cache: true
-          cache-dependency-path: go.sum
+      - name: Set up Go
+        uses: actions/setup-go@<commit-sha>
+        with:
+          go-version-file: go.mod
+          check-latest: true
+          cache: true
+          cache-dependency-path: go.sum
+      - name: Verify formatting (gofmt)
+        run: |
+          unformatted=$(gofmt -l .)
+          if [[ -n "$unformatted" ]]; then
+            echo "These files are not gofmt-formatted:"
+            echo "$unformatted"
+            echo "Run: gofmt -w ."
+            echo "Diff:"
+            for f in $unformatted; do
+              echo "---- $f"
+              diff -u "$f" <(gofmt "$f") || true
+            done
+            exit 1
+          fi
+
+      - name: Tidy (verify no changes)
+        run: |
+          set -euo pipefail
+          cp go.mod go.mod.orig
+          had_go_sum=0
+          if [[ -f go.sum ]]; then
+            cp go.sum go.sum.orig
+            had_go_sum=1
+          fi
+          if ! go mod tidy; then
+            echo "'go mod tidy' failed"
+            mv -f go.mod.orig go.mod
+            if [[ $had_go_sum -eq 1 ]]; then mv -f go.sum.orig go.sum; else rm -f go.sum || true; fi
+            exit 1
+          fi
+          tidy_ok=0
+          diff -u go.mod.orig go.mod || tidy_ok=1
+          if [[ $had_go_sum -eq 1 ]]; then
+            diff -u go.sum.orig go.sum || tidy_ok=1
+          else
+            [[ -f go.sum ]] && tidy_ok=1
+          fi
+          mv -f go.mod.orig go.mod
+          if [[ $had_go_sum -eq 1 ]]; then mv -f go.sum.orig go.sum; else rm -f go.sum || true; fi
+          if [[ $tidy_ok -ne 0 ]]; then
+            echo "go.mod/go.sum would change. Run 'go mod tidy' locally and commit."
+            exit 1
+          fi
+
+      - name: Vet
+        run: go vet -mod=readonly ./...
+
+      - name: Build
+        run: go build -mod=readonly ./...
+
-      - name: Vet
-        run: go vet -mod=readonly ./...
-
-      - name: Build
-        run: go build -mod=readonly ./...
+      - name: Vet
+        run: go vet -mod=readonly ./...
+
+      - name: Lint (golangci-lint)
+        uses: golangci/golangci-lint-action@<commit-sha>
+        with:
+          version: latest
+          args: --timeout=5m
+
+      - name: Build
+        run: go build -mod=readonly ./...
-      - name: Vet
-        run: go vet -mod=readonly ./...
-
-      - name: Build
-        run: go build -mod=readonly ./...
+      - name: Vet
+        run: go vet -mod=readonly ./...
+
+      - name: Lint (golangci-lint)
+        uses: golangci/golangci-lint-action@<commit-sha>
+        with:
+          version: latest
+          args: --timeout=5m
+
+      - name: Build
+        run: go build -mod=readonly ./...
+      - name: Test (race, coverage)
+        run: go test -mod=readonly -race -covermode=atomic -coverprofile=coverage.out ./...
+
-      - name: Test (race, coverage)
-        run: go test -mod=readonly -race -covermode=atomic -coverprofile=coverage.out ./...
+      - name: Test (race, coverage)
+        run: go test -mod=readonly -race -shuffle=on -covermode=atomic -coverprofile=coverage.out ./...
-      - name: Test (race, coverage)
-        run: go test -mod=readonly -race -covermode=atomic -coverprofile=coverage.out ./...
+      - name: Test (race, coverage)
+        run: go test -mod=readonly -race -shuffle=on -covermode=atomic -coverprofile=coverage.out ./...
+      - name: Upload coverage (artifact)
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage
+          path: coverage.out
+          if-no-files-found: ignore
diff --git a/.gitignore b/.gitignore
@@ -28,5 +28,15 @@ go.work.sum
 .env
 
 # Editor/IDE
-# .idea/
-# .vscode/
+.idea/
+.vscode/*
+!.vscode/extensions.json
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+
+# Build artifacts
+/build/
+
+# Task cache directory
+.task/
diff --git a/README.md b/README.md
@@ -1,2 +1,46 @@
 # Log2CSV
-Log2CSV is a command-line tool that transforms any log into a CSV file using a regular expression.
+
+`Log2CSV` is a command-line tool written in Go that transforms raw log files into CSV.
+It extracts structured data from logs using a regular expression with named capture groups and writes CSV to STDOUT.
+
+---
+
+## Features
+
+- Reads log lines from **STDIN** and writes CSV to **STDOUT**.
+- Extracts fields using **named capture groups** (`(?P<Name>...)`).
+- The **CSV header row** is automatically generated from group names.
+- Preserves the input's line endings (LF/CRLF).
+- Optional **unmatched mode** (`-unmatched`) to print **unique non-matching lines** instead of CSV.
+
+## Usage
+
+Log2CSV reads from STDIN and, by default, converts matching lines to CSV written to STDOUT using the provided regular expression.
+
+- **CSV mode:** provide `-regexp` with named capture groups; matching lines become CSV rows (header generated automatically).
+- **Unmatched mode:** add `-unmatched` to print each **unique** input line that **does not** match the pattern (one per line), to STDOUT. No CSV is produced in this mode.
+
+### Example - convert UFW log to CSV
+
+```sh
+log2csv -regexp '^(?P<Timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?\+\d{2}:\d{2})\s+(?P<Hostname>\S+)\s+(?P<Facility>\S+):\s+\[\s*(?P<Kernel_Time>\d+(?:\.\d+)?)\]\s+\[(?P<Action>UFW\s+\S+)\]\s+IN=(?P<IN>\S*)\s+OUT=(?P<OUT>\S*)\s+MAC=(?P<MAC>\S*)\s+SRC=(?P<SRC>\S+)\s+DST=(?P<DST>\S+)\s+LEN=(?P<LEN>\d+)(?:(?:\s+(?:TOS=(?P<TOS>0x[0-9A-Fa-f]{2})\s+)?(?:PREC=(?P<PREC>0x[0-9A-Fa-f]{2})\s+)?(?:TTL=(?P<TTL>\d+)\s+)?ID=(?P<ID>\d+)(?:\s+(?P<DF>DF))?)|\s+TC=(?P<TC>\d+)\s+HOPLIMIT=(?P<HOPLIMIT>\d+)\s+FLOWLBL=(?P<FLOWLBL>[0-9A-Fa-fx]+))?\s+PROTO=(?P<PROTO>[A-Za-z0-9]+)(?:\s+(?:SPT|SP)=(?P<SPT>\d+))?(?:\s+(?:DPT|DP)=(?P<DPT>\d+))?(?:\s+WINDOW=(?P<WINDOW>\d+))?(?:\s+RES=(?P<RES>0x[0-9A-Fa-f]{2}))?(?:\s+(?P<TCP_Flags>(?:SYN|ACK|FIN|RST|PSH|URG|CWR|ECE)(?:\s+(?:SYN|ACK|FIN|RST|PSH|URG|CWR|ECE))*))?(?:\s+URGP=(?P<URGP>\d+))?(?:\s+TYPE=(?P<ICMP_TYPE>\d+))?(?:\s+CODE=(?P<ICMP_CODE>\d+))?(?:\s+SEQ=(?P<ICMP_SEQ>\d+))?(?:\s+LEN=(?P<L4_LEN>\d+))?\s*$' < /var/log/ufw.log
+```
+
+On Windows
+
+```powershell
+Get-Content C:\path\ufw.log | log2csv -regexp "^(?P<Timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?\+\d{2}:\d{2})\s+(?P<Hostname>\S+)\s+(?P<Facility>\S+):\s+\[\s*(?P<Kernel_Time>\d+(?:\.\d+)?)\]\s+\[(?P<Action>UFW\s+\S+)\]\s+IN=(?P<IN>\S*)\s+OUT=(?P<OUT>\S*)\s+MAC=(?P<MAC>\S*)\s+SRC=(?P<SRC>\S+)\s+DST=(?P<DST>\S+)\s+LEN=(?P<LEN>\d+)(?:(?:\s+(?:TOS=(?P<TOS>0x[0-9A-Fa-f]{2})\s+)?(?:PREC=(?P<PREC>0x[0-9A-Fa-f]{2})\s+)?(?:TTL=(?P<TTL>\d+)\s+)?ID=(?P<ID>\d+)(?:\s+(?P<DF>DF))?)|\s+TC=(?P<TC>\d+)\s+HOPLIMIT=(?P<HOPLIMIT>\d+)\s+FLOWLBL=(?P<FLOWLBL>[0-9A-Fa-fx]+))?\s+PROTO=(?P<PROTO>[A-Za-z0-9]+)(?:\s+(?:SPT|SP)=(?P<SPT>\d+))?(?:\s+(?:DPT|DP)=(?P<DPT>\d+))?(?:\s+WINDOW=(?P<WINDOW>\d+))?(?:\s+RES=(?P<RES>0x[0-9A-Fa-f]{2}))?(?:\s+(?P<TCP_Flags>(?:SYN|ACK|FIN|RST|PSH|URG|CWR|ECE)(?:\s+(?:SYN|ACK|FIN|RST|PSH|URG|CWR|ECE))*))?(?:\s+URGP=(?P<URGP>\d+))?(?:\s+TYPE=(?P<ICMP_TYPE>\d+))?(?:\s+CODE=(?P<ICMP_CODE>\d+))?(?:\s+SEQ=(?P<ICMP_SEQ>\d+))?(?:\s+LEN=(?P<L4_LEN>\d+))?\s*$"
+```
+
+## Install
+
+```sh
+go install github.com/b4prog/Log2CSV@latest
+```
+
+## Build from source
+
+```sh
+task build
+./build/log2csv -help
+```
diff --git a/cmd/log2csv/log2csv.go b/cmd/log2csv/log2csv.go
@@ -0,0 +1,201 @@
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/csv"
+	"errors"
+	"flag"
+	"fmt"
+	"io"
+	"os"
+	"regexp"
+	"strings"
+)
+
+const (
+	// csvSeparator is the field delimiter assigned to the CSV writer.
+	csvSeparator   = ','
+	// logLineSizeMax is both the number of bytes peeked to detect the input's
+	// line ending and the initial capacity of the line scanner's buffer.
+	logLineSizeMax = 64 * 1024
+	// bufferSizeMax is the maximum size the line scanner's buffer may grow to.
+	bufferSizeMax  = 10 * 1024 * 1024
+)
+
+var (
+	// ErrInvalidRegexp is returned when the provided regular expression
+	// cannot be compiled due to invalid syntax.
+	ErrInvalidRegexp = errors.New("invalid regular expression syntax")
+	// ErrNoNamedCaptureGroups is returned when the provided regular expression
+	// does not contain any named capture groups (e.g. (?P<name>...)).
+	ErrNoNamedCaptureGroups = errors.New("the regular expression must contain at least one named capture group")
+)
+
+// usage prints the command-line help text (synopsis, description and
+// examples) to STDERR. It is installed as flag.Usage by run and is also
+// called directly when the required -regexp flag is missing.
+func usage() {
+	msg := `Usage:
+  log2csv -regexp '<pattern with (?P<name>...) groups>' [-unmatched]
+
+Description:
+  Reads log lines from STDIN, extracts named capture groups using the provided regular expression,
+  and writes a CSV to STDOUT.
+
+  If -unmatched is provided, the tool instead prints the unique non-matching lines (one per line) to STDOUT.
+
+Examples:
+  # CSV mode (default)
+  log2csv -regexp '^(?P<Timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?\+\d{2}:\d{2})\s+(?P<Hostname>\S+)\s+(?P<Facility>\S+):\s+\[\s*(?P<Kernel_Time>\d+(?:\.\d+)?)\]\s+\[(?P<Action>UFW\s+\S+)\]\s+IN=(?P<IN>\S*)\s+OUT=(?P<OUT>\S*)\s+MAC=(?P<MAC>\S*)\s+SRC=(?P<SRC>\S+)\s+DST=(?P<DST>\S+)\s+LEN=(?P<LEN>\d+)(?:(?:\s+(?:TOS=(?P<TOS>0x[0-9A-Fa-f]{2})\s+)?(?:PREC=(?P<PREC>0x[0-9A-Fa-f]{2})\s+)?(?:TTL=(?P<TTL>\d+)\s+)?ID=(?P<ID>\d+)(?:\s+(?P<DF>DF))?)|\s+TC=(?P<TC>\d+)\s+HOPLIMIT=(?P<HOPLIMIT>\d+)\s+FLOWLBL=(?P<FLOWLBL>[0-9A-Fa-fx]+))?\s+PROTO=(?P<PROTO>[A-Za-z0-9]+)(?:\s+(?:SPT|SP)=(?P<SPT>\d+))?(?:\s+(?:DPT|DP)=(?P<DPT>\d+))?(?:\s+WINDOW=(?P<WINDOW>\d+))?(?:\s+RES=(?P<RES>0x[0-9A-Fa-f]{2}))?(?:\s+(?P<TCP_Flags>(?:SYN|ACK|FIN|RST|PSH|URG|CWR|ECE)(?:\s+(?:SYN|ACK|FIN|RST|PSH|URG|CWR|ECE))*))?(?:\s+URGP=(?P<URGP>\d+))?(?:\s+TYPE=(?P<ICMP_TYPE>\d+))?(?:\s+CODE=(?P<ICMP_CODE>\d+))?(?:\s+SEQ=(?P<ICMP_SEQ>\d+))?(?:\s+LEN=(?P<L4_LEN>\d+))?\s*$' < /var/log/ufw.log
+
+  # List unique non-matching lines
+  log2csv -regexp '<your-regex>' -unmatched < /var/log/ufw.log
+`
+	fmt.Fprint(os.Stderr, msg)
+}
+
+// main runs the tool and, when run reports a failure, prints the error to
+// STDERR and exits with status 1.
+func main() {
+	err := run()
+	if err == nil {
+		return
+	}
+	fmt.Fprintf(os.Stderr, "error: %v\n", err)
+	os.Exit(1)
+}
+
+func run() error {
+	pattern := flag.String("regexp", "", "regular expression with named capture groups, e.g. '(?P<ts>...) (?P<level>...)'")
+	listUnmatched := flag.Bool("unmatched", false, "only list unique non-matching lines from STDIN")
+	flag.Usage = usage
+	flag.Parse()
+	if strings.TrimSpace(*pattern) == "" {
+		usage()
+		return fmt.Errorf("flag -regexp is required")
+	}
+	re, err := regexp.Compile(*pattern)
+	if err != nil {
+		return fmt.Errorf("%w: %v", ErrInvalidRegexp, err)
+	}
+	groupNames := extractGroupNames(re)
+	if len(groupNames) == 0 {
+		return ErrNoNamedCaptureGroups
+	}
+	out := bufio.NewWriter(os.Stdout)
+	err = processInput(os.Stdin, re, groupNames, out, *listUnmatched)
+	if flushErr := out.Flush(); err == nil && flushErr != nil {
+		err = flushErr
+	}
+	return err
+}
+
+// extractGroupNames returns the names of the named capture groups of re, in
+// the order they appear in the pattern. Unnamed groups are skipped. The result
+// is never nil; it is empty when the pattern has no named group.
+func extractGroupNames(re *regexp.Regexp) []string {
+	// Index 0 of SubexpNames is the whole-match slot and is always present.
+	all := re.SubexpNames()
+	named := make([]string, 0, len(all)-1)
+	for i := 1; i < len(all); i++ {
+		if all[i] == "" {
+			continue
+		}
+		named = append(named, all[i])
+	}
+	return named
+}
+
+// processInput reads log lines from input, matches each against re and writes
+// the result to output.
+//
+// In CSV mode (listUnmatched == false) a header row built from groupNames is
+// written before the first matching line, followed by one CSV row per matching
+// line. Non-blank lines that do not match are counted and a warning with the
+// first such line is printed to STDERR at the end.
+//
+// In unmatched mode (listUnmatched == true) no CSV is produced; each distinct
+// non-blank, non-matching line is written once, followed by the detected line
+// ending.
+//
+// The line ending (LF or CRLF) is detected from the start of the input and
+// reused for the output. The function returns the first error encountered
+// while detecting the line ending, writing, or scanning.
+func processInput(input io.Reader, re *regexp.Regexp, groupNames []string, output io.Writer, listUnmatched bool) error {
+	inputReader, lineEnding, err := peekForLineEnding(input, logLineSizeMax)
+	if err != nil {
+		return err
+	}
+	sc := openInput(inputReader)
+	csvWriter := csv.NewWriter(output)
+	csvWriter.Comma = csvSeparator
+	csvWriter.UseCRLF = lineEnding == "\r\n"
+	firstLine := true
+	ignoredLines := 0
+	var firstIgnoredLine string
+	// Track unique non-matching lines when -unmatched is set.
+	seenUnmatched := make(map[string]struct{})
+	for sc.Scan() {
+		line := sc.Text()
+		values, ok := processLine(line, re, groupNames)
+		if !ok {
+			// Blank lines are dropped silently in both modes.
+			if strings.TrimSpace(line) == "" {
+				continue
+			}
+			if !listUnmatched {
+				ignoredLines++
+				if firstIgnoredLine == "" {
+					firstIgnoredLine = line
+				}
+				continue
+			}
+			if _, exists := seenUnmatched[line]; !exists {
+				seenUnmatched[line] = struct{}{}
+				// In unmatched mode, print each unique non-matching line once, preserving line endings.
+				if _, err := io.WriteString(output, line+lineEnding); err != nil {
+					return err
+				}
+			}
+			continue
+		}
+		// In unmatched mode, we skip matched lines entirely.
+		if listUnmatched {
+			continue
+		}
+		if firstLine {
+			firstLine = false
+			if err := csvWriter.Write(groupNames); err != nil {
+				return err
+			}
+		}
+		if err := csvWriter.Write(values); err != nil {
+			return err
+		}
+	}
+	// Flush before inspecting the scanner error: csv.Writer buffers internally,
+	// so returning early on a scan failure (e.g. an over-long line) would drop
+	// already converted rows and could leave the output cut off mid-row.
+	csvWriter.Flush()
+	// Only warn about ignored lines in CSV mode.
+	if !listUnmatched && ignoredLines > 0 {
+		fmt.Fprintf(os.Stderr, "\nwarning: %d log line(s) did not match the pattern and were ignored\nfirst ignored line: %q\n", ignoredLines, firstIgnoredLine)
+	}
+	// A scan error takes precedence over a write/flush error.
+	if err := sc.Err(); err != nil {
+		return err
+	}
+	return csvWriter.Error()
+}
+
+// peekForLineEnding inspects up to sizeMaxPeek bytes at the start of input,
+// without consuming them, to find out which line ending the input uses.
+//
+// It returns a buffered reader that must be used in place of input (it still
+// yields every byte of the original stream) and the detected line ending:
+// "\r\n" when the first '\n' is preceded by '\r', "\n" otherwise, including
+// when no newline is found within the peeked bytes. Reaching the end of a
+// short input is not an error; any other read error is returned.
+func peekForLineEnding(input io.Reader, sizeMaxPeek int) (io.Reader, string, error) {
+	// The reader's buffer must be at least sizeMaxPeek bytes: with the default
+	// 4096-byte buffer, Peek would stop at 4096 bytes and a CRLF input whose
+	// first line is longer than that would be reported as LF.
+	inputBuffer := bufio.NewReaderSize(input, sizeMaxPeek)
+	sample, err := inputBuffer.Peek(sizeMaxPeek)
+	if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, bufio.ErrBufferFull) {
+		return nil, "", err
+	}
+	if idx := bytes.IndexByte(sample, '\n'); idx > 0 && sample[idx-1] == '\r' {
+		return inputBuffer, "\r\n", nil
+	}
+	return inputBuffer, "\n", nil
+}
+
+// openInput returns a line scanner over input whose buffer starts with a
+// capacity of logLineSizeMax bytes and may grow up to bufferSizeMax bytes.
+func openInput(input io.Reader) *bufio.Scanner {
+	sc := bufio.NewScanner(input)
+	sc.Buffer(make([]byte, 0, logLineSizeMax), bufferSizeMax)
+	return sc
+}
+
+// processLine matches line against re and collects the values of its named
+// capture groups, in pattern order.
+//
+// It returns the values and true when the line matches and at least one named
+// group captured a non-empty value. It returns nil and false when the line
+// does not match or when every named group is empty, meaning the line should
+// be ignored. groupNames is only used to size the result slice.
+func processLine(line string, re *regexp.Regexp, groupNames []string) ([]string, bool) {
+	match := re.FindStringSubmatch(line)
+	if match == nil {
+		return nil, false
+	}
+	row := make([]string, 0, len(groupNames))
+	sawValue := false
+	for i, name := range re.SubexpNames() {
+		// Slot 0 is the whole match; unnamed groups are not exported.
+		if i == 0 || name == "" {
+			continue
+		}
+		captured := match[i]
+		row = append(row, captured)
+		sawValue = sawValue || captured != ""
+	}
+	if !sawValue {
+		return nil, false
+	}
+	return row, true
+}
diff --git a/go.mod b/go.mod
@@ -0,0 +1,3 @@
+module github.com/b4prog/Log2CSV
+
+go 1.25