diff --git a/.coderabbit.yaml b/.coderabbit.yaml
new file mode 100644
index 0000000..77316b3
--- /dev/null
+++ b/.coderabbit.yaml
@@ -0,0 +1,28 @@
+language: en
+
+reviews:
+  profile: assertive
+  high_level_summary: true
+  poem: false
+  auto_review:
+    enabled: true
+    drafts: true
+
+  tools:
+    golangci-lint:
+      enabled: true
+    gitleaks:
+      enabled: true
+    yamllint:
+      enabled: true
+    actionlint:
+      enabled: true
+    shellcheck:
+      enabled: true
+    markdownlint:
+      enabled: true
+    osvScanner:
+      enabled: true
+    github-checks:
+      enabled: true
+      timeout_ms: 300000
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..73ef2a2
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,103 @@
+name: CI
+
+on:
+  push:
+  pull_request:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ci-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-test:
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+
+    # Run from the repo root now that go.mod is at top level
+    defaults:
+      run:
+        shell: bash
+        working-directory: .
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: "1.25.x"
+          check-latest: true
+          cache: true
+          # Key the cache on go.mod too: while the module has no dependencies there
+          # is no go.sum, and a path matching no file leaves setup-go without a key.
+          cache-dependency-path: |
+            go.mod
+            go.sum
+
+      - name: Verify formatting (gofmt)
+        run: |
+          unformatted=$(gofmt -l .)
+          if [[ -n "$unformatted" ]]; then
+            echo "These files are not gofmt-formatted:"
+            echo "$unformatted"
+            echo "Run: gofmt -w ."
+            echo "Diff:"
+            # Read one path per line so file names containing spaces stay intact.
+            while IFS= read -r f; do
+              echo "---- $f"
+              diff -u "$f" <(gofmt "$f") || true
+            done <<< "$unformatted"
+            exit 1
+          fi
+
+      - name: Tidy (verify no changes)
+        run: |
+          # Back up go.mod/go.sum, run 'go mod tidy', diff against the backups,
+          # then restore the originals so later steps see the committed files.
+          set -euo pipefail
+          cp go.mod go.mod.orig
+          had_go_sum=0
+          if [[ -f go.sum ]]; then
+            cp go.sum go.sum.orig
+            had_go_sum=1
+          fi
+          if ! go mod tidy; then
+            echo "'go mod tidy' failed"
+            mv -f go.mod.orig go.mod
+            if [[ $had_go_sum -eq 1 ]]; then mv -f go.sum.orig go.sum; else rm -f go.sum || true; fi
+            exit 1
+          fi
+          tidy_ok=0
+          diff -u go.mod.orig go.mod || tidy_ok=1
+          if [[ $had_go_sum -eq 1 ]]; then
+            diff -u go.sum.orig go.sum || tidy_ok=1
+          else
+            if [[ -f go.sum ]]; then tidy_ok=1; fi
+          fi
+          mv -f go.mod.orig go.mod
+          if [[ $had_go_sum -eq 1 ]]; then mv -f go.sum.orig go.sum; else rm -f go.sum || true; fi
+          if [[ $tidy_ok -ne 0 ]]; then
+            echo "go.mod/go.sum would change. Run 'go mod tidy' locally and commit."
+            exit 1
+          fi
+
+      - name: Vet
+        run: go vet -mod=readonly ./...
+
+      - name: Build
+        run: go build -mod=readonly ./...
+
+      - name: Test (race, coverage)
+        run: go test -mod=readonly -race -covermode=atomic -coverprofile=coverage.out ./...
+
+      - name: Upload coverage (artifact)
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage
+          path: coverage.out
+          if-no-files-found: ignore
diff --git a/.gitignore b/.gitignore
index aaadf73..4114b79 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,5 +28,15 @@ go.work.sum
 .env
 
 # Editor/IDE
-# .idea/
-# .vscode/
+.idea/
+.vscode/*
+!.vscode/extensions.json
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+
+# Build artifacts
+/build/
+
+# Task cache directory
+.task/
diff --git a/README.md b/README.md
index 70abc12..f42ba48 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,46 @@
 # Log2CSV
-Log2CSV is a command-line tool that transforms any log into a CSV file using a regular expression.
+
+`Log2CSV` is a command-line tool written in Go that transforms raw log files into CSV.
+It extracts structured data from logs using a regular expression with named capture groups and writes CSV to STDOUT.
+
+---
+
+## Features
+
+- Reads log lines from **STDIN** and writes CSV to **STDOUT**.
+- Extracts fields using **named capture groups** (`(?P<name>...)`).
+- The **CSV header row** is automatically generated from group names.
+- Preserves the input's line endings (LF/CRLF).
+- Optional **unmatched mode** (`-unmatched`) to print **unique non-matching lines** instead of CSV.
+
+## Usage
+
+Log2CSV reads from STDIN and, by default, converts matching lines to CSV written to STDOUT using the provided regular expression.
+
+- **CSV mode:** provide `-regexp` with named capture groups; matching lines become CSV rows (header generated automatically).
+- **Unmatched mode:** add `-unmatched` to print each **unique** input line that **does not** match the pattern (one per line), to STDOUT. No CSV is produced in this mode.
+
+### Example - convert UFW log to CSV
+
+```sh
+log2csv -regexp '^(?P\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?\+\d{2}:\d{2})\s+(?P\S+)\s+(?P\S+):\s+\[\s*(?P\d+(?:\.\d+)?)\]\s+\[(?PUFW\s+\S+)\]\s+IN=(?P\S*)\s+OUT=(?P\S*)\s+MAC=(?P\S*)\s+SRC=(?P\S+)\s+DST=(?P\S+)\s+LEN=(?P\d+)(?:(?:\s+(?:TOS=(?P0x[0-9A-Fa-f]{2})\s+)?(?:PREC=(?P0x[0-9A-Fa-f]{2})\s+)?(?:TTL=(?P\d+)\s+)?ID=(?P\d+)(?:\s+(?PDF))?)|\s+TC=(?P\d+)\s+HOPLIMIT=(?P\d+)\s+FLOWLBL=(?P[0-9A-Fa-fx]+))?\s+PROTO=(?P[A-Za-z0-9]+)(?:\s+(?:SPT|SP)=(?P\d+))?(?:\s+(?:DPT|DP)=(?P\d+))?(?:\s+WINDOW=(?P\d+))?(?:\s+RES=(?P0x[0-9A-Fa-f]{2}))?(?:\s+(?P(?:SYN|ACK|FIN|RST|PSH|URG|CWR|ECE)(?:\s+(?:SYN|ACK|FIN|RST|PSH|URG|CWR|ECE))*))?(?:\s+URGP=(?P\d+))?(?:\s+TYPE=(?P\d+))?(?:\s+CODE=(?P\d+))?(?:\s+SEQ=(?P\d+))?(?:\s+LEN=(?P\d+))?\s*$' < /var/log/ufw.log
+```
+
+On Windows
+
+```powershell
+Get-Content C:\path\ufw.log | log2csv -regexp "^(?P\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?\+\d{2}:\d{2})\s+(?P\S+)\s+(?P\S+):\s+\[\s*(?P\d+(?:\.\d+)?)\]\s+\[(?PUFW\s+\S+)\]\s+IN=(?P\S*)\s+OUT=(?P\S*)\s+MAC=(?P\S*)\s+SRC=(?P\S+)\s+DST=(?P\S+)\s+LEN=(?P\d+)(?:(?:\s+(?:TOS=(?P0x[0-9A-Fa-f]{2})\s+)?(?:PREC=(?P0x[0-9A-Fa-f]{2})\s+)?(?:TTL=(?P\d+)\s+)?ID=(?P\d+)(?:\s+(?PDF))?)|\s+TC=(?P\d+)\s+HOPLIMIT=(?P\d+)\s+FLOWLBL=(?P[0-9A-Fa-fx]+))?\s+PROTO=(?P[A-Za-z0-9]+)(?:\s+(?:SPT|SP)=(?P\d+))?(?:\s+(?:DPT|DP)=(?P\d+))?(?:\s+WINDOW=(?P\d+))?(?:\s+RES=(?P0x[0-9A-Fa-f]{2}))?(?:\s+(?P(?:SYN|ACK|FIN|RST|PSH|URG|CWR|ECE)(?:\s+(?:SYN|ACK|FIN|RST|PSH|URG|CWR|ECE))*))?(?:\s+URGP=(?P\d+))?(?:\s+TYPE=(?P\d+))?(?:\s+CODE=(?P\d+))?(?:\s+SEQ=(?P\d+))?(?:\s+LEN=(?P\d+))?\s*$"
+```
+
+## Install
+
+```sh
+go install github.com/b4prog/Log2CSV/cmd/log2csv@latest
+```
+
+## Build from source
+
+```sh
+task build
+./build/log2csv -help
+```
diff --git a/cmd/log2csv/log2csv.go b/cmd/log2csv/log2csv.go
new file mode 100644
index 0000000..f46f941
--- /dev/null
+++ b/cmd/log2csv/log2csv.go
@@ -0,0 +1,234 @@
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/csv"
+	"errors"
+	"flag"
+	"fmt"
+	"io"
+	"os"
+	"regexp"
+	"strings"
+)
+
+const (
+	// csvSeparator is the field delimiter of the generated CSV.
+	csvSeparator = ','
+	// logLineSizeMax is the number of bytes inspected to detect the line ending
+	// and the initial capacity of the line scanner's buffer.
+	logLineSizeMax = 64 * 1024
+	// bufferSizeMax is the largest buffer the line scanner may allocate for one line.
+	bufferSizeMax = 10 * 1024 * 1024
+)
+
+var (
+	// ErrInvalidRegexp is returned when the provided regular expression
+	// cannot be compiled due to invalid syntax.
+	ErrInvalidRegexp = errors.New("invalid regular expression syntax")
+	// ErrNoNamedCaptureGroups is returned when the provided regular expression
+	// does not contain any named capture groups (e.g. (?P<name>...)).
+	ErrNoNamedCaptureGroups = errors.New("the regular expression must contain at least one named capture group")
+)
+
+// usage prints the command-line help to STDERR.
+func usage() {
+	msg := `Usage:
+ log2csv -regexp '<regexp with (?P<name>...) groups>' [-unmatched]
+
+Description:
+ Reads log lines from STDIN, extracts named capture groups using the provided regular expression,
+ and writes a CSV to STDOUT.
+
+ If -unmatched is provided, the tool instead prints the unique non-matching lines (one per line) to STDOUT.
+
+Examples:
+ # CSV mode (default)
+ log2csv -regexp '^(?P\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?\+\d{2}:\d{2})\s+(?P\S+)\s+(?P\S+):\s+\[\s*(?P\d+(?:\.\d+)?)\]\s+\[(?PUFW\s+\S+)\]\s+IN=(?P\S*)\s+OUT=(?P\S*)\s+MAC=(?P\S*)\s+SRC=(?P\S+)\s+DST=(?P\S+)\s+LEN=(?P\d+)(?:(?:\s+(?:TOS=(?P0x[0-9A-Fa-f]{2})\s+)?(?:PREC=(?P0x[0-9A-Fa-f]{2})\s+)?(?:TTL=(?P\d+)\s+)?ID=(?P\d+)(?:\s+(?PDF))?)|\s+TC=(?P\d+)\s+HOPLIMIT=(?P\d+)\s+FLOWLBL=(?P[0-9A-Fa-fx]+))?\s+PROTO=(?P[A-Za-z0-9]+)(?:\s+(?:SPT|SP)=(?P\d+))?(?:\s+(?:DPT|DP)=(?P\d+))?(?:\s+WINDOW=(?P\d+))?(?:\s+RES=(?P0x[0-9A-Fa-f]{2}))?(?:\s+(?P(?:SYN|ACK|FIN|RST|PSH|URG|CWR|ECE)(?:\s+(?:SYN|ACK|FIN|RST|PSH|URG|CWR|ECE))*))?(?:\s+URGP=(?P\d+))?(?:\s+TYPE=(?P\d+))?(?:\s+CODE=(?P\d+))?(?:\s+SEQ=(?P\d+))?(?:\s+LEN=(?P\d+))?\s*$' < /var/log/ufw.log
+
+ # List unique non-matching lines
+ log2csv -regexp '<regexp>' -unmatched < /var/log/ufw.log
+`
+	fmt.Fprint(os.Stderr, msg)
+}
+
+// main runs the tool; on failure it prints the error to STDERR and exits with status 1.
+func main() {
+	if err := run(); err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(1)
+	}
+}
+
+// run parses the command-line flags, compiles the pattern and converts STDIN
+// to STDOUT. It fails when -regexp is missing or blank, does not compile, or
+// contains no named capture group; otherwise it returns the first error met
+// while processing the input or flushing the output.
+func run() error {
+	pattern := flag.String("regexp", "", "regular expression with named capture groups, e.g. '(?P<name1>...) (?P<name2>...)'")
+	listUnmatched := flag.Bool("unmatched", false, "only list unique non-matching lines from STDIN")
+	flag.Usage = usage
+	flag.Parse()
+	if strings.TrimSpace(*pattern) == "" {
+		usage()
+		return fmt.Errorf("flag -regexp is required")
+	}
+	re, err := regexp.Compile(*pattern)
+	if err != nil {
+		return fmt.Errorf("%w: %v", ErrInvalidRegexp, err)
+	}
+	groupNames := extractGroupNames(re)
+	if len(groupNames) == 0 {
+		return ErrNoNamedCaptureGroups
+	}
+	out := bufio.NewWriter(os.Stdout)
+	err = processInput(os.Stdin, re, groupNames, out, *listUnmatched)
+	// Always flush; keep the processing error when there is one.
+	if flushErr := out.Flush(); err == nil && flushErr != nil {
+		err = flushErr
+	}
+	return err
+}
+
+// extractGroupNames returns the non-empty capture group names of re, in the
+// order the groups appear in the pattern.
+func extractGroupNames(re *regexp.Regexp) []string {
+	names := re.SubexpNames()[1:]
+	ordered := make([]string, 0, len(names))
+	for _, name := range names {
+		if name != "" {
+			ordered = append(ordered, name)
+		}
+	}
+	return ordered
+}
+
+// processInput reads input line by line and writes the result to output.
+//
+// In CSV mode (listUnmatched false) it writes a header row made of groupNames
+// before the first matching line, then one record per matching line, and
+// reports on STDERR how many non-blank lines did not match. In unmatched mode
+// it writes each distinct non-blank, non-matching line once and no CSV. The
+// line ending detected at the start of the input is reused for the output.
+func processInput(input io.Reader, re *regexp.Regexp, groupNames []string, output io.Writer, listUnmatched bool) error {
+	inputReader, lineEnding, err := peekForLineEnding(input, logLineSizeMax)
+	if err != nil {
+		return err
+	}
+	sc := openInput(inputReader)
+	csvWriter := csv.NewWriter(output)
+	csvWriter.Comma = csvSeparator
+	csvWriter.UseCRLF = lineEnding == "\r\n"
+	firstLine := true
+	ignoredLines := 0
+	var firstIgnoredLine string
+	// Track unique non-matching lines when -unmatched is set.
+	seenUnmatched := make(map[string]struct{})
+	for sc.Scan() {
+		line := sc.Text()
+		values, ok := processLine(line, re, groupNames)
+		if !ok {
+			if strings.TrimSpace(line) != "" {
+				if listUnmatched {
+					if _, exists := seenUnmatched[line]; !exists {
+						seenUnmatched[line] = struct{}{}
+						// In unmatched mode, print each unique non-matching line once, preserving line endings.
+						if _, err := io.WriteString(output, line+lineEnding); err != nil {
+							return err
+						}
+					}
+				} else {
+					ignoredLines++
+					if firstIgnoredLine == "" {
+						firstIgnoredLine = line
+					}
+				}
+			}
+			continue
+		}
+		// In unmatched mode, we skip matched lines entirely.
+		if listUnmatched {
+			continue
+		}
+		if firstLine {
+			firstLine = false
+			if err := csvWriter.Write(groupNames); err != nil {
+				return err
+			}
+		}
+		if err := csvWriter.Write(values); err != nil {
+			return err
+		}
+	}
+	// Only warn about ignored lines in CSV mode.
+	if !listUnmatched && ignoredLines > 0 {
+		fmt.Fprintf(os.Stderr, "\nwarning: %d log line(s) did not match the pattern and were ignored\nfirst ignored line: %q\n", ignoredLines, firstIgnoredLine)
+	}
+	// Flush before reporting a scan error so that every record converted so far
+	// reaches output instead of being dropped with the csv.Writer's buffer.
+	csvWriter.Flush()
+	if err := sc.Err(); err != nil {
+		return err
+	}
+	return csvWriter.Error()
+}
+
+// peekForLineEnding inspects up to sizeMaxPeek leading bytes of input without
+// consuming them and returns a reader positioned at the start of input together
+// with the detected line ending: "\r\n" when the first '\n' found is preceded
+// by '\r', and "\n" otherwise, including when no newline is found.
+func peekForLineEnding(input io.Reader, sizeMaxPeek int) (io.Reader, string, error) {
+	// Size the buffer to the peek window: Peek cannot return more bytes than the
+	// buffer holds, and bufio.NewReader's default buffer is only 4096 bytes.
+	inputBuffer := bufio.NewReaderSize(input, sizeMaxPeek)
+	sample, err := inputBuffer.Peek(sizeMaxPeek)
+	// A short input (io.EOF) still yields a usable sample.
+	if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, bufio.ErrBufferFull) {
+		return nil, "", err
+	}
+	if idx := bytes.IndexByte(sample, '\n'); idx >= 0 {
+		if idx > 0 && sample[idx-1] == '\r' {
+			return inputBuffer, "\r\n", nil
+		}
+		return inputBuffer, "\n", nil
+	}
+	return inputBuffer, "\n", nil
+}
+
+// openInput returns a line scanner over input whose buffer starts at
+// logLineSizeMax bytes and may grow up to bufferSizeMax bytes.
+func openInput(input io.Reader) *bufio.Scanner {
+	inputScanner := bufio.NewScanner(input)
+	buf := make([]byte, 0, logLineSizeMax)
+	inputScanner.Buffer(buf, bufferSizeMax)
+	return inputScanner
+}
+
+// processLine matches line against re. It returns the values of the named
+// capture groups and true, or nil and false when the line does not match or
+// every named group captured an empty string.
+func processLine(line string, re *regexp.Regexp, groupNames []string) ([]string, bool) {
+	submatches := re.FindStringSubmatch(line)
+	if submatches == nil {
+		return nil, false // the pattern does not match this line
+	}
+	subNames := re.SubexpNames()
+	values := make([]string, 0, len(groupNames))
+	allEmpty := true
+	for idxSubmatch := 1; idxSubmatch < len(submatches); idxSubmatch++ { // index 0 is the whole match
+		name := subNames[idxSubmatch]
+		if name == "" {
+			continue // unnamed groups are not CSV columns
+		}
+		val := submatches[idxSubmatch]
+		if val != "" {
+			allEmpty = false
+		}
+		values = append(values, val)
+	}
+	if allEmpty {
+		return nil, false // a match with only empty named groups is treated as no match
+	}
+	return values, true
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..ad0cc13
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/b4prog/Log2CSV
+
+go 1.25
diff --git a/taskfile.yaml b/taskfile.yaml
new file mode 100644
index 0000000..e212ea6
--- /dev/null
+++ b/taskfile.yaml
@@ -0,0 +1,102 @@
+version: "3"
+
+vars:
+  APP_NAME: Log2CSV
+  SRC_DIR: cmd/log2csv
+  BUILD_DIR: build
+  BIN_NAME: log2csv{{if eq OS "windows"}}.exe{{end}}  # OS is a Task template function, not a variable
+  CGO_ENABLED: "0"
+  LDFLAGS: ""
+
+tasks:
+  default:
+    desc: Build the project
+    cmds:
+      - task: build
+
+  help:
+    desc: Show available tasks
+    silent: true
+    cmds:
+      - |
+        echo "Tasks:"
+        echo " task build # Compile to ./build/{{.BIN_NAME}}"
+        echo " task run -- [args] # Run the compiled binary with optional CLI args"
+        echo " task clean # Remove ./build"
+        echo " task fmt # go fmt ./..."
+        echo " task vet # go vet ./..."
+        echo " task tidy # go mod tidy"
+        echo " task deps # go mod download"
+        echo " task test # go test -race ./..."
+
+  tidy:
+    desc: go mod tidy in SRC_DIR
+    dir: "{{.SRC_DIR}}"
+    cmds:
+      - go mod tidy
+
+  deps:
+    desc: Download modules
+    dir: "{{.SRC_DIR}}"
+    cmds:
+      - go mod download
+
+  fmt:
+    desc: go fmt
+    dir: "{{.SRC_DIR}}"
+    cmds:
+      - go fmt ./...
+
+  vet:
+    desc: go vet
+    dir: "{{.SRC_DIR}}"
+    cmds:
+      - go vet ./...
+
+  build:
+    desc: Build {{.APP_NAME}} to ./{{.BUILD_DIR}}/{{.BIN_NAME}}
+    env:
+      CGO_ENABLED: "{{.CGO_ENABLED}}"
+    cmds:
+      # Task runs `deps:` entries in parallel; call these in order instead so
+      # `go mod tidy` never rewrites go.mod/go.sum while `go vet` reads them.
+      - task: deps
+      - task: tidy
+      - task: vet
+      - >
+        {{if eq OS "windows"}}
+        powershell -NoProfile -Command "New-Item -ItemType Directory -Force {{.BUILD_DIR}}"
+        {{else}}
+        mkdir -p {{.BUILD_DIR}}
+        {{end}}
+      - go build -trimpath -ldflags "{{.LDFLAGS}}" -o {{.BUILD_DIR}}/{{.BIN_NAME}} ./{{.SRC_DIR}}
+    sources:
+      - "**/*.go"
+      - "go.mod"
+      - "go.sum"
+    generates:
+      - "{{.BUILD_DIR}}/{{.BIN_NAME}}"
+
+  run:
+    desc: Build then run the binary (pass CLI args after --)
+    deps: [build]
+    cmds:
+      # One POSIX-style command works on every OS under Task's built-in shell.
+      - '"./{{.BUILD_DIR}}/{{.BIN_NAME}}" {{.CLI_ARGS}}'
+    vars:
+      CLI_ARGS: "{{.CLI_ARGS}}"
+  test:
+    desc: Run unit tests
+    dir: "{{.SRC_DIR}}"
+    cmds:
+      - go test -race ./...
+
+  clean:
+    desc: Remove build artifacts
+    cmds:
+      - >
+        {{if eq OS "windows"}}
+        powershell -NoProfile -Command "if (Test-Path '{{.BUILD_DIR}}') { Remove-Item -Recurse -Force '{{.BUILD_DIR}}' }"
+        {{else}}
+        rm -rf "{{.BUILD_DIR}}"
+        {{end}}