diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e6c3dc7..d16734d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,26 +1,52 @@ -name: CI - -on: - push: - branches: ["main", "develop"] - pull_request: - branches: ["main", "develop"] - -jobs: - build-and-test: - runs-on: ubuntu-latest - - steps: - - name: ๐๏ธ Checkout - uses: actions/checkout@v2 - - - name: ๐ง Set up Go - uses: actions/setup-go@v2 - with: - go-version: "1.23" - - - name: ๐ Build - run: go build ./... - - - name: ๐งช Test - run: go test -v ./... +# File: .github/workflows/ci.yml + +name: Continuous Integration (CI) + +on: + push: + branches: ["*"] + pull_request: + branches: ["main", "develop"] + +jobs: + build-and-test: + name: ๐จ Build and Test + runs-on: ubuntu-latest + + steps: + - name: ๐๏ธ Checkout + uses: actions/checkout@v2 + + - name: ๐ง Set up Go + uses: actions/setup-go@v2 + with: + go-version: "1.23" + + - name: ๐ฆ Check go.sum consistency + run: | + go mod tidy + if ! git diff --exit-code go.sum; then + echo "go.sum is not up-to-date. Please run 'go mod tidy' and commit the changes." + exit 1 + fi + + - name: ๐ ๏ธ Build + run: go build ./... + + - name: ๐งช Run Tests with Coverage + run: | + go test -v ./... 
-coverprofile=coverage.out | tee test-results.log + grep -v "cmd/scrapeycli/main.go:" coverage.out > tmp && mv tmp coverage.out + + - name: ๐ค Upload Coverage Reports to Codecov + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + files: coverage.out + + - name: ๐ Upload Test Logs on Failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: test-results + path: test-results.log diff --git a/.gitignore b/.gitignore index ee7c360..a59c494 100644 --- a/.gitignore +++ b/.gitignore @@ -1,33 +1,33 @@ -# Go binaries for programs and plugins -*.exe -*.exe~ -*.dll -*.so -*.dylib - -# Build artifacts and local bin directories -bin/ -build/ -dist/ - -# Test binaries (built with `go test -c`) -*.test - -# Output of the go coverage tool -*.out -coverage/ - -# Dependency directories -vendor/ - -# Go workspace files -go.work -go.work.sum - -# Editor/OS-specific -.DS_Store -.idea/ -.vscode/ - -# Environment variables -.env +# File: .gitignore +# Go binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Build artifacts and local bin directories +bin +build +dist + +# Test binaries (built with `go test -c`) +*.test + +# Dependency directories +vendor + +# Go workspace files +go.work +go.work.sum + +# Editor/OS-specific +.DS_Store +.idea + +# Environment variables +.env + +# Ignore all configs except default.json +configs/* +!configs/default.json \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..95ec900 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "editor.formatOnSave": true, + "[go]": { + "editor.defaultFormatter": "golang.go" + } +} diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..a4c5f8b --- /dev/null +++ b/Makefile @@ -0,0 +1,139 @@ +# File: Makefile +# Purpose: +# - "install": ensure modules are tidy if go.mod or go.sum changed. +# - "test": run coverage if code changed. 
+# - "coverage": display coverage summary from coverage.out +# - "build": recompile binary if Go source or mod files changed. +# - "run": executes the compiled binary +# - "tree": display directory structure +# - All skip with "No changes detected, skipping X." if nothing changed. + +.PHONY: install test coverage build run tree + +# Directories for build artifacts and stamp files +BUILD_DIR := build +STAMPS_DIR := $(BUILD_DIR)/.stamps + +# Stamp file paths for each step +INSTALL_STAMP := $(STAMPS_DIR)/install.stamp +BUILD_STAMP := $(STAMPS_DIR)/build.stamp +TEST_STAMP := $(STAMPS_DIR)/test.stamp + +BINARY := $(BUILD_DIR)/scrapeycli + +# Coverage output +COVER_DIR := ${BUILD_DIR}/coverage +COVER_PROFILE := $(COVER_DIR)/coverage.txt +COVER_HTML := $(COVER_DIR)/coverage.html + +# All Go source files (including _test.go files) +GO_FILES := $(shell find . -type f -name '*.go' -not -path "./vendor/*") + +# Reusable messages +SKIP_MSG := No changes detected, skipping +CHANGE_MSG := Some files changed; re-running + +# ------------------------------------------------------------------------------ +# install: ensure go.mod/go.sum are tidy if changed. +# ------------------------------------------------------------------------------ +install: + @mkdir -p $(STAMPS_DIR) + @TARGET=install; \ + if [ ! -f "$(INSTALL_STAMP)" ] || [ go.mod -nt "$(INSTALL_STAMP)" ] || [ go.sum -nt "$(INSTALL_STAMP)" ]; then \ + echo "$(CHANGE_MSG) $$TARGET..."; \ + go mod tidy; \ + if ! git diff --exit-code go.sum; then \ + echo "go.sum updated. Please commit the changes."; \ + exit 1; \ + fi; \ + touch "$(INSTALL_STAMP)"; \ + echo "Done with installing."; \ + else \ + echo "$(SKIP_MSG) $$TARGET."; \ + fi + +# ------------------------------------------------------------------------------ +# test: run tests and update coverage if any Go source (including _test.go files) have changed. +# Ensures gotestsum is installed before running tests. +# Depends on install. 
+# ------------------------------------------------------------------------------ +test: + @if ! command -v gotestsum >/dev/null 2>&1; then \ + echo "Installing gotestsum..."; \ + go install gotest.tools/gotestsum@latest; \ + fi + @mkdir -p $(COVER_DIR) $(STAMPS_DIR) + @TARGET=test; \ + if [ ! -f "$(TEST_STAMP)" ] || [ -n "$$(find $(GO_FILES) -newer "$(TEST_STAMP)" 2>/dev/null)" ]; then \ + echo "$(CHANGE_MSG) $$TARGET..."; \ + > "$(COVER_PROFILE)"; \ + if gotestsum --format short-verbose ./... && \ + go test -cover -covermode=atomic -coverpkg=./... -coverprofile="$(COVER_PROFILE)" ./... >/dev/null; then \ + if [ -f "$(COVER_PROFILE)" ]; then \ + grep -v "cmd/scrapeycli/main.go:" "$(COVER_PROFILE)" > "$(COVER_PROFILE).tmp" && mv "$(COVER_PROFILE).tmp" "$(COVER_PROFILE)"; \ + go tool cover -html="$(COVER_PROFILE)" -o "$(COVER_HTML)"; \ + echo "Coverage file generated at: $(COVER_PROFILE)"; \ + echo "HTML coverage report at: $(COVER_HTML)"; \ + else \ + echo "ERROR: Coverage file was not generated!"; \ + fi; \ + touch "$(TEST_STAMP)"; \ + else \ + echo "Tests failed! Skipping stamp update."; \ + exit 1; \ + fi; \ + else \ + echo "$(SKIP_MSG) $$TARGET."; \ + fi + +# ------------------------------------------------------------------------------ +# coverage: displays a colorized coverage summary from the coverage file. +# Depends on test. +# ------------------------------------------------------------------------------ +coverage: test + @echo "================== COVERAGE SUMMARY ==================" + @go tool cover -func="$(COVER_PROFILE)" | go run ./scripts/coverage_formatter.go + @echo "=====================================================" + +# ------------------------------------------------------------------------------ +# build: compile binary if Go sources changed. +# Depends on install. +# ------------------------------------------------------------------------------ +build: install + @mkdir -p $(BUILD_DIR) + @mkdir -p $(STAMPS_DIR) + @TARGET=build; \ + if [ ! 
-f "$(BUILD_STAMP)" ] || [ -n "$$(find $(GO_FILES) -newer "$(BUILD_STAMP)" 2>/dev/null)" ]; then \ + echo "$(CHANGE_MSG) $$TARGET..."; \ + go build -o $(BINARY) ./cmd/scrapeycli; \ + touch "$(BUILD_STAMP)"; \ + echo "Done with building."; \ + else \ + echo "$(SKIP_MSG) $$TARGET."; \ + fi + +# ------------------------------------------------------------------------------ +# run: execute the compiled binary. +# Depends on build. +# ------------------------------------------------------------------------------ +run: build + @echo "Running application..." + @$(BINARY) + +# ------------------------------------------------------------------------------ +# tree: displays directory structure (installs tree if missing). +# ------------------------------------------------------------------------------ +tree: + @if ! command -v tree >/dev/null 2>&1; then \ + echo "tree command not found. Attempting to install..."; \ + OS=$$(uname); \ + if [ "$$OS" = "Linux" ]; then \ + sudo apt-get update && sudo apt-get install -y tree; \ + elif [ "$$OS" = "Darwin" ]; then \ + brew install tree; \ + else \ + echo "Automatic installation not supported on $$OS. Please install manually."; \ + exit 1; \ + fi; \ + fi; \ + tree -n -I "vendor|.git" \ No newline at end of file diff --git a/README.md b/README.md index 68686ea..b5fbc4a 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ + + # โจ Scrapey CLI -[](https://github.com/heinrichb/scrapey-cli/actions/workflows/ci.yml) +[](https://github.com/heinrichb/scrapey-cli/actions/workflows/ci.yml) +[](https://codecov.io/gh/heinrichb/scrapey-cli) [](https://pkg.go.dev/github.com/heinrichb/scrapey-cli) -[](https://example.com/coverage) Scrapey CLI is a lightweight, configurable web crawler and scraper. It collects data from websites based on rules defined in a config file. It can handle HTML parsing, data extraction, and plans to offer multiple storage options (JSON, XML, Excel, databases, etc.). 
@@ -10,28 +12,59 @@ Scrapey CLI is a lightweight, configurable web crawler and scraper. It collects ## ๐ Features -- Lightweight and modular CLI interface -- Configurable input (`.json` config file or command-line flags) -- Extensible parsing logic for targeted HTML elements -- Future support for multiple storage options (JSON, XML, Excel, MongoDB, MySQL) -- DRY and clean code principles +- **Lightweight & Modular CLI:** Built with clean, DRY code principles. +- **Configurable Input:** Accepts configuration via a JSON file or command-line flags. +- **Extensible Parsing:** Customizable HTML parsing logic. +- **Planned Storage Options:** Future support for multiple output formats including JSON, XML, Excel, MongoDB, MySQL. --- ## ๐ฑ Getting Started -1. **Clone the repo**: - git clone https://github.com/heinrichb/scrapey-cli.git +1. **Clone the Repo** + + git clone https://github.com/heinrichb/scrapey-cli.git + +2. **Initialize Go Modules & Build the CLI** + + - **Option 1:** Using the Makefile (recommended) + + make build + + - This command runs `go mod tidy` and then builds the binary into the `build` folder. + + - **Option 2:** Directly via Go + + go mod tidy + go build -o build/scrapeycli ./cmd/scrapeycli + +3. **Run the CLI** + + - **Direct Execution:** + + ./build/scrapeycli --config configs/default.json + + - **Using the Makefile:** + + The Makefile provides a `run` target which allows you to pass in optional variables: + + - **Default Run:** + + make run + + - This uses the default configuration file (`configs/default.json`). -2. **Initialize Go modules**: - cd scrapey-cli - go mod tidy + - **Override Config:** -3. **Build the CLI**: - go build ./cmd/scrapeycli + make run CONFIG=configs/other.json -4. 
**Run**: - ./scrapeycli --config configs/default.json + - **Pass a URL:** + + make run URL=https://example.org + + - **Combined:** + + make run CONFIG=configs/other.json URL=https://example.org --- @@ -39,66 +72,192 @@ Scrapey CLI is a lightweight, configurable web crawler and scraper. It collects ``` scrapey-cli/ +โโโ .github/ +โ โโโ workflows/ +โ โโโ ci.yml +โโโ .vscode/ +โ โโโ settings.json # VS Code settings (format on save for Go) โโโ cmd/ -โ โโโ scrapeycli/ -โ โโโ main.go # CLI entry point -โโโ pkg/ -โ โโโ config/ # Config loading logic -โ โโโ crawler/ # Core web crawling logic -โ โโโ parser/ # HTML parsing logic -โ โโโ storage/ # JSON/other storage logic +โ โโโ scrapeycli/ +โ โโโ main.go โโโ configs/ -โ โโโ default.json # Example config -โโโ .github/ -โ โโโ workflows/ -โ โโโ ci.yml # CI/CD pipeline config -โโโ docs/ # Additional documentation -โโโ build/ # Build scripts, Dockerfiles, etc. -โโโ test/ # Optional integration tests -โโโ README.md # This file +โ โโโ default.json # Default/example configuration file +โโโ pkg/ +โ โโโ config/ +โ โ โโโ config.go # Config loading logic +โ โโโ crawler/ +โ โ โโโ crawler.go # Core web crawling logic +โ โโโ parser/ +โ โ โโโ parser.go # HTML parsing logic +โ โโโ storage/ +โ โ โโโ storage.go # Storage logic +โ โโโ utils/ +โ โโโ printcolor.go # Colorized terminal output utility +โ โโโ printstruct.go # Utility for printing non-empty struct fields +โโโ scripts/ +โ โโโ coverage_formatter.go # Formats and colorizes Go test coverage output +โโโ test/ # Optional integration tests +โ โโโ fail_test.go # Test case designed to always fail, used to debug test output +โโโ .gitignore +โโโ LICENSE # MIT License file +โโโ Makefile # Build & run script for CLI (includes targets for build, run, and test) +โโโ go.mod +โโโ go.sum +โโโ README.md ``` --- +## ๐ง Configuration Options + +Scrapey CLI is configured using a JSON file that defines how websites are crawled and scraped. 
Below is a detailed breakdown of the available configuration options. + +### ๐ URL Configuration + +```json +"url": { + "base": "https://example.com", + "routes": [ + "/route1", + "/route2", + "*" + ], + "includeBase": false +} +``` + +- **base**: The primary domain to scrape. +- **routes**: List of specific paths to scrape. Supports `*` as a wildcard for full site crawling. +- **includeBase**: Whether to include the base URL in the scrape. + +### ๐ Parsing Rules + +```json +"parseRules": { + "title": "title", + "metaDescription": "meta[name='description']", + "articleContent": "article", + "author": ".author-name", + "datePublished": "meta[property='article:published_time']" +} +``` + +- **title**: Extracts the page title. +- **metaDescription**: Extracts the meta description. +- **articleContent**: Defines the main article section. +- **author**: Selector for extracting author names. +- **datePublished**: Extracts the publication date from meta properties. + +### ๐พ Storage Options + +```json +"storage": { + "outputFormats": ["json", "csv", "xml"], + "savePath": "output/", + "fileName": "scraped_data" +} +``` + +- **outputFormats**: List of formats in which data will be stored. +- **savePath**: Directory where scraped content is saved. +- **fileName**: Base name for output files. + +### โก Scraping Behavior + +```json +"scrapingOptions": { + "maxDepth": 2, + "rateLimit": 1.5, + "retryAttempts": 3, + "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" +} +``` + +- **maxDepth**: Defines how deep the scraper should follow links. +- **rateLimit**: Time delay (in seconds) between requests to avoid rate-limiting. +- **retryAttempts**: Number of retries for failed requests. +- **userAgent**: Custom user-agent string to mimic a browser. + +### ๐ Data Formatting + +```json +"dataFormatting": { + "cleanWhitespace": true, + "removeHTML": true +} +``` + +- **cleanWhitespace**: Removes unnecessary whitespace in extracted content. 
+- **removeHTML**: Strips HTML tags from extracted content for cleaner output. + +This configuration file allows fine-tuning of scraping behavior, data extraction, and storage formats for ultimate flexibility in web scraping. + +--- + ## ๐ Usage -- **Basic**: - ./scrapeycli --url https://example.com +- **Basic Execution:** -- **With config file**: - ./scrapeycli --config configs/default.json + ./build/scrapeycli --url https://example.com -- **Future**: - - Save data to JSON - - Multiple URLs at once - - Concurrency and rate-limiting +- **With a Config File:** + + ./build/scrapeycli --config configs/default.json + +- **Using the Makefile:** + + - Run with defaults: + + make run + + - Override configuration and/or URL: + + make run CONFIG=configs/other.json URL=https://example.org + +- **Future Enhancements:** + - Save scraped data to JSON. + - Support for scraping multiple URLs simultaneously. + - Concurrency and rate-limiting. --- ## ๐งช Tests -- Run unit tests locally: - go test ./... +- **Run Unit Tests Locally:** + To run tests for all modules and the test folder (if it exists), use: + + make test -- Automated tests on GitHub Actions: - - Triggered on every push and pull request to main or develop branches. + This command first runs "go test ./..." to execute tests in all packages, and then, if the "test" folder exists and contains Go files, it will run tests in that folder as well. + +- **Automated Tests on GitHub Actions:** + - Tests are triggered on every push and pull request to the "main" or "develop" branches. - See Build & Test (https://github.com/heinrichb/scrapey-cli/actions) for logs and results. --- ## ๐ค Contributing -1. Fork the project -2. Create your feature branch (git checkout -b feature/amazing-feature) -3. Commit your changes (git commit -m 'Add some amazing feature') -4. Push to the branch (git push origin feature/amazing-feature) -5. Open a Pull Request +1. Fork the project. +2. 
Create your feature branch: + + git checkout -b feature/amazing-feature + +3. Commit your changes: + + git commit -m 'Add some amazing feature' + +4. Push to the branch: + + git push origin feature/amazing-feature + +5. Open a Pull Request. --- ## ๐ License -This project is licensed under the MIT License (LICENSE). +This project is licensed under the MIT License ([LICENSE](LICENSE)). --- diff --git a/cmd/scrapeycli/main.go b/cmd/scrapeycli/main.go index 1cec306..e456373 100644 --- a/cmd/scrapeycli/main.go +++ b/cmd/scrapeycli/main.go @@ -1,58 +1,137 @@ -package main - -import ( - "flag" - "fmt" - "os" - - "github.com/fatih/color" - "github.com/heinrichb/scrapey-cli/pkg/config" -) - -var ( - configPath string - url string -) - -func init() { - // Two flags for config (-config and -c) so user can choose either - flag.StringVar(&configPath, "config", "", "Path to config file") - flag.StringVar(&configPath, "c", "", "Path to config file (shorthand)") - - // Flag for URL - flag.StringVar(&url, "url", "", "URL to scrape (overrides config)") -} - -func main() { - // Parse CLI flags - flag.Parse() - - // Print a colored welcome message - color.Cyan("Welcome to Scrapey CLI!") - - // If no config path is provided, we can optionally default to something: - if configPath == "" { - configPath = "configs/default.json" - } - - // Show a colored message for loading config - fmt.Printf("%s%s\n", color.New(color.FgHiYellow).Sprint("Loading config from: "), configPath) - // Attempt to load config - cfg, err := config.Load(configPath) - if err != nil { - color.Red("Failed to load config: %v", err) - os.Exit(1) - } - - // If user supplied a URL, override config - if url != "" { - fmt.Printf("%s%s\n", color.New(color.FgHiMagenta).Sprint("Overriding config with URL flag: "), url) - cfg.URL = url - } - - // Another colored message to confirm it's loaded - fmt.Printf("%s%s\n", color.New(color.FgHiGreen).Sprint("Loaded config from: "), configPath) - - // Indicate successful finish for now - 
color.Green("Scrapey CLI initialization complete.") -} +package main + +import ( + "flag" + "os" + + "github.com/fatih/color" + "github.com/heinrichb/scrapey-cli/pkg/config" + "github.com/heinrichb/scrapey-cli/pkg/utils" +) + +/* +Global variables for storing command-line arguments. + +- configPath: The path to the configuration file. +- url: The URL to be scraped, which may override the URL in the config. +- maxDepth: Overrides the scraping depth if set. +- rateLimit: Overrides the request rate limit. +- verbose: Enables verbose output. +*/ +var ( + configPath string + url string + maxDepth int + rateLimit float64 + verbose bool +) + +/* +init registers command-line flags for configuration. + +It sets up flags for: +- The config file ("config" and its shorthand "c"). +- URL override. +- Scraping depth override. +- Rate limit override. +- Verbose output ("verbose" and its shorthand "v"). +*/ +func init() { + flag.StringVar(&configPath, "config", "", "Path to config file") + flag.StringVar(&configPath, "c", "", "Path to config file (shorthand)") + flag.StringVar(&url, "url", "", "URL to scrape (overrides config)") + flag.IntVar(&maxDepth, "maxDepth", 0, "Override max crawl depth") + flag.Float64Var(&rateLimit, "rateLimit", 0, "Override request rate limit (seconds)") + flag.BoolVar(&verbose, "verbose", false, "Enable verbose output") + flag.BoolVar(&verbose, "v", false, "Enable verbose output (shorthand)") +} + +// Helper functions to create pointers for literal values. +func ptrString(s string) *string { return &s } +func ptrInt(i int) *int { return &i } +func ptrFloat64(f float64) *float64 { return &f } + +/* +main is the entry point of Scrapey CLI. + +It parses command-line flags, prints a welcome message, loads the configuration, +applies CLI overrides using a ConfigOverride object, and prints confirmation messages. +*/ +func main() { + // Parse CLI flags. + flag.Parse() + + // Store the verbose flag in global state. 
+ config.Verbose = verbose + + // Print a welcome message in cyan using our PrintColored utility. + utils.PrintColored("Welcome to Scrapey CLI!", "", color.FgCyan) + + // Default to "configs/default.json" if no config path is provided. + if configPath == "" { + configPath = "configs/default.json" + } + + // Attempt to load the configuration from the specified file. + cfg, err := config.Load(configPath) + if err != nil { + // If loading fails, print an error message in red and exit. + utils.PrintColored("Failed to load config: ", err.Error(), color.FgRed) + os.Exit(1) + } + + // Construct a partial ConfigOverride struct for CLI overrides. + cliOverrides := config.ConfigOverride{} + + // Apply URL override if provided. + if url != "" { + cliOverrides.URL = &struct { + Base *string `json:"base"` + Routes *[]string `json:"routes"` + IncludeBase *bool `json:"includeBase"` + }{ + Base: ptrString(url), + } + } + + // Apply maxDepth override if provided. + if maxDepth > 0 { + if cliOverrides.ScrapingOptions == nil { + cliOverrides.ScrapingOptions = &struct { + MaxDepth *int `json:"maxDepth"` + RateLimit *float64 `json:"rateLimit"` + RetryAttempts *int `json:"retryAttempts"` + UserAgent *string `json:"userAgent"` + }{} + } + cliOverrides.ScrapingOptions.MaxDepth = ptrInt(maxDepth) + } + + // Apply rateLimit override if provided. + if rateLimit > 0 { + if cliOverrides.ScrapingOptions == nil { + cliOverrides.ScrapingOptions = &struct { + MaxDepth *int `json:"maxDepth"` + RateLimit *float64 `json:"rateLimit"` + RetryAttempts *int `json:"retryAttempts"` + UserAgent *string `json:"userAgent"` + }{} + } + cliOverrides.ScrapingOptions.RateLimit = ptrFloat64(rateLimit) + } + + // Apply all CLI overrides dynamically. + cfg.OverrideConfig(cliOverrides) + + // Print confirmation of loaded config. + utils.PrintColored("Scrapey CLI initialization complete.", "", color.FgGreen) + + // Print which routes will be scraped. 
+ utils.PrintColored("Base URL: ", cfg.URL.Base, color.FgYellow) + if cfg.URL.IncludeBase { + utils.PrintColored("Including base URL in scraping.", "", color.FgGreen) + } + for _, route := range cfg.URL.Routes { + utils.PrintColored("Scraping route: ", route, color.FgHiBlue) + } +} diff --git a/configs/default.json b/configs/default.json index ad43486..f1298dc 100644 --- a/configs/default.json +++ b/configs/default.json @@ -1,7 +1,30 @@ { - "url": "https://example.com", + "version": "1.0", + "url": { + "base": "https://example.com", + "routes": ["/route1", "/route2", "*"], + "includeBase": false + }, "parseRules": { "title": "title", - "metaDescription": "meta[name='description']" + "metaDescription": "meta[name='description']", + "articleContent": "article", + "author": ".author-name", + "datePublished": "meta[property='article:published_time']" + }, + "storage": { + "outputFormats": ["json", "csv", "xml"], + "savePath": "output/", + "fileName": "scraped_data" + }, + "scrapingOptions": { + "maxDepth": 2, + "rateLimit": 1.5, + "retryAttempts": 3, + "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + }, + "dataFormatting": { + "cleanWhitespace": true, + "removeHTML": true } } diff --git a/go.mod b/go.mod index 87ee416..100f748 100644 --- a/go.mod +++ b/go.mod @@ -1,9 +1,14 @@ +// File: go.mod module github.com/heinrichb/scrapey-cli go 1.23.4 require ( - github.com/fatih/color v1.18.0 // indirect + bou.ke/monkey v1.0.2 + github.com/fatih/color v1.18.0 +) + +require ( github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect golang.org/x/sys v0.25.0 // indirect diff --git a/go.sum b/go.sum index 33148a4..a1b828d 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +bou.ke/monkey v1.0.2 h1:kWcnsrCNUatbxncxR/ThdYqbytgOIArtYWqcQLQzKLI= +bou.ke/monkey v1.0.2/go.mod h1:OqickVX3tNx6t33n1xvtTtu85YN5s6cKwVug+oHMaIA= github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= github.com/fatih/color 
v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= diff --git a/pkg/config/config.go b/pkg/config/config.go index f1ecac6..6ee0820 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -1,16 +1,316 @@ -package config - -// Config is a basic struct to hold config data. -type Config struct { - URL string `json:"url,omitempty"` - // Add more fields as needed for your use case -} - -// Load reads config from filePath and returns a Config struct. -// Currently just returns a placeholder. -func Load(filePath string) (*Config, error) { - // In future, parse JSON from filePath - return &Config{ - URL: "http://example.com", // Stub data - }, nil -} +package config + +import ( + "encoding/json" + "fmt" + "os" + + "github.com/fatih/color" + "github.com/heinrichb/scrapey-cli/pkg/utils" +) + +/* +Global Verbose flag. + +This flag determines whether verbose output is enabled. +It is set in `main.go` and used throughout the application. +*/ +var Verbose bool + +/* +Config holds configuration data used by Scrapey CLI. + +Fields: + - URL: A struct containing the base URL and routes to scrape. + - ParseRules: A struct containing parsing rules. + - Storage: A struct defining how data is saved. + - ScrapingOptions: Settings for crawling behavior. + - DataFormatting: Options for cleaning extracted content. + +Usage: + + The configuration is loaded from a JSON file to guide the crawler and parser. 
+*/ +type Config struct { + Version string `json:"version"` + URL struct { + Base string `json:"base"` + Routes []string `json:"routes"` + IncludeBase bool `json:"includeBase"` + } `json:"url"` + ParseRules struct { + Title string `json:"title,omitempty"` + MetaDescription string `json:"metaDescription,omitempty"` + ArticleContent string `json:"articleContent,omitempty"` + Author string `json:"author,omitempty"` + DatePublished string `json:"datePublished,omitempty"` + } `json:"parseRules"` + Storage struct { + OutputFormats []string `json:"outputFormats"` + SavePath string `json:"savePath"` + FileName string `json:"fileName"` + } `json:"storage"` + ScrapingOptions struct { + MaxDepth int `json:"maxDepth"` + RateLimit float64 `json:"rateLimit"` + RetryAttempts int `json:"retryAttempts"` + UserAgent string `json:"userAgent"` + } `json:"scrapingOptions"` + DataFormatting struct { + CleanWhitespace bool `json:"cleanWhitespace"` + RemoveHTML bool `json:"removeHTML"` + } `json:"dataFormatting"` +} + +/* +ConfigOverride represents a partial configuration used for overriding values. +All fields are pointers, so that nil indicates "no override" while a non-nil value, +even if zero, is used to override the corresponding Config field. 
+*/ +type ConfigOverride struct { + Version *string `json:"version"` + URL *struct { + Base *string `json:"base"` + Routes *[]string `json:"routes"` + IncludeBase *bool `json:"includeBase"` + } `json:"url"` + ParseRules *struct { + Title *string `json:"title,omitempty"` + MetaDescription *string `json:"metaDescription,omitempty"` + ArticleContent *string `json:"articleContent,omitempty"` + Author *string `json:"author,omitempty"` + DatePublished *string `json:"datePublished,omitempty"` + } `json:"parseRules"` + Storage *struct { + OutputFormats *[]string `json:"outputFormats"` + SavePath *string `json:"savePath"` + FileName *string `json:"fileName"` + } `json:"storage"` + ScrapingOptions *struct { + MaxDepth *int `json:"maxDepth"` + RateLimit *float64 `json:"rateLimit"` + RetryAttempts *int `json:"retryAttempts"` + UserAgent *string `json:"userAgent"` + } `json:"scrapingOptions"` + DataFormatting *struct { + CleanWhitespace *bool `json:"cleanWhitespace"` + RemoveHTML *bool `json:"removeHTML"` + } `json:"dataFormatting"` +} + +/* +ApplyDefaults populates missing fields in the Config struct with default values. + +Usage: + + cfg.ApplyDefaults() + +Notes: + - Ensures that a missing Base URL defaults to "https://example.com". + - Sets default scraping and storage parameters. + - Provides a sensible fallback for all configurable values. 
/*
ApplyDefaults fills in default values for any configuration fields that are
still zero-valued after loading. Explicitly configured values are preserved.
*/
func (cfg *Config) ApplyDefaults() {
	if cfg.URL.Base == "" {
		cfg.URL.Base = "https://example.com"
	}
	if len(cfg.URL.Routes) == 0 {
		cfg.URL.Routes = []string{"/"}
	}
	if cfg.ScrapingOptions.MaxDepth == 0 {
		cfg.ScrapingOptions.MaxDepth = 2
	}
	if cfg.ScrapingOptions.RateLimit == 0 {
		cfg.ScrapingOptions.RateLimit = 1.5
	}
	if cfg.ScrapingOptions.RetryAttempts == 0 {
		cfg.ScrapingOptions.RetryAttempts = 3
	}
	if cfg.ScrapingOptions.UserAgent == "" {
		cfg.ScrapingOptions.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
	}
	if len(cfg.Storage.OutputFormats) == 0 {
		cfg.Storage.OutputFormats = []string{"json"}
	}
	if cfg.Storage.SavePath == "" {
		cfg.Storage.SavePath = "output/"
	}
	if cfg.Storage.FileName == "" {
		cfg.Storage.FileName = "scraped_data"
	}
}

/*
Load reads configuration data from the specified filePath.

Parameters:
  - filePath: The path to the JSON configuration file.

Returns:
  - A pointer to a Config struct containing the parsed configuration.
  - An error if the file does not exist, cannot be read, or if the JSON is invalid.

Usage:

	cfg, err := Load("configs/default.json")
	if err != nil {
		// Handle error
	}
	// Use cfg to configure the application.
*/
func Load(filePath string) (*Config, error) {
	if _, err := os.Stat(filePath); os.IsNotExist(err) {
		return nil, fmt.Errorf("config file %s does not exist", filePath)
	}

	// NOTE(review): this success message is printed before the read/parse
	// steps below, so it also appears when those steps subsequently fail
	// (the tests rely on this ordering).
	utils.PrintColored("Loaded config from: ", filePath, color.FgHiGreen)

	content, err := os.ReadFile(filePath)
	if err != nil {
		return nil, fmt.Errorf("failed to read config file: %v", err)
	}

	var cfg Config
	if err := json.Unmarshal(content, &cfg); err != nil {
		return nil, fmt.Errorf("invalid JSON in config file: %v", err)
	}

	// Apply default values where necessary.
	cfg.ApplyDefaults()

	// Verbose mode: print all non-empty fields of the loaded configuration.
	if Verbose {
		utils.PrintNonEmptyFields("", cfg)
	}

	return &cfg, nil
}

/*
OverrideConfig applies values from the provided `overrides` object to the existing configuration.
Only fields with non-nil pointers in the overrides object are applied; all other fields remain unchanged.

Parameters:
  - overrides: A ConfigOverride struct containing only the fields to override.
    A nil pointer indicates that no override should occur for that field.

Usage:

	cfg.OverrideConfig(ConfigOverride{
		URL: &struct {
			Base        *string   `json:"base"`
			Routes      *[]string `json:"routes"`
			IncludeBase *bool     `json:"includeBase"`
		}{
			Base: ptrString("https://example.org"),
		},
		ScrapingOptions: &struct {
			MaxDepth      *int     `json:"maxDepth"`
			RateLimit     *float64 `json:"rateLimit"`
			RetryAttempts *int     `json:"retryAttempts"`
			UserAgent     *string  `json:"userAgent"`
		}{
			MaxDepth: ptrInt(5),
		},
	})

Notes:
  - Only fields with non-nil pointers in `overrides` are applied.
  - This allows partial configuration overrides without unintentionally overwriting existing values.
  - Both struct and non-struct fields are overridden if provided.
  - Each applied override is echoed to stdout via utils.PrintColored.
*/
func (cfg *Config) OverrideConfig(overrides ConfigOverride) {
	// Override non-struct field: Version.
	if overrides.Version != nil {
		utils.PrintColored("Overriding Version: ", *overrides.Version, color.FgHiMagenta)
		cfg.Version = *overrides.Version
	}

	// Override URL fields.
	if overrides.URL != nil {
		if overrides.URL.Base != nil {
			utils.PrintColored("Overriding URL.Base: ", *overrides.URL.Base, color.FgHiMagenta)
			cfg.URL.Base = *overrides.URL.Base
		}
		if overrides.URL.Routes != nil {
			utils.PrintColored("Overriding URL.Routes: ", fmt.Sprint(*overrides.URL.Routes), color.FgHiMagenta)
			cfg.URL.Routes = *overrides.URL.Routes
		}
		if overrides.URL.IncludeBase != nil {
			utils.PrintColored("Overriding URL.IncludeBase: ", fmt.Sprint(*overrides.URL.IncludeBase), color.FgHiMagenta)
			cfg.URL.IncludeBase = *overrides.URL.IncludeBase
		}
	}

	// Override ParseRules fields.
	if overrides.ParseRules != nil {
		if overrides.ParseRules.Title != nil {
			utils.PrintColored("Overriding ParseRules.Title: ", *overrides.ParseRules.Title, color.FgHiMagenta)
			cfg.ParseRules.Title = *overrides.ParseRules.Title
		}
		if overrides.ParseRules.MetaDescription != nil {
			utils.PrintColored("Overriding ParseRules.MetaDescription: ", *overrides.ParseRules.MetaDescription, color.FgHiMagenta)
			cfg.ParseRules.MetaDescription = *overrides.ParseRules.MetaDescription
		}
		if overrides.ParseRules.ArticleContent != nil {
			utils.PrintColored("Overriding ParseRules.ArticleContent: ", *overrides.ParseRules.ArticleContent, color.FgHiMagenta)
			cfg.ParseRules.ArticleContent = *overrides.ParseRules.ArticleContent
		}
		if overrides.ParseRules.Author != nil {
			utils.PrintColored("Overriding ParseRules.Author: ", *overrides.ParseRules.Author, color.FgHiMagenta)
			cfg.ParseRules.Author = *overrides.ParseRules.Author
		}
		if overrides.ParseRules.DatePublished != nil {
			utils.PrintColored("Overriding ParseRules.DatePublished: ", *overrides.ParseRules.DatePublished, color.FgHiMagenta)
			cfg.ParseRules.DatePublished = *overrides.ParseRules.DatePublished
		}
	}

	// Override Storage fields.
	if overrides.Storage != nil {
		if overrides.Storage.OutputFormats != nil {
			utils.PrintColored("Overriding Storage.OutputFormats: ", fmt.Sprint(*overrides.Storage.OutputFormats), color.FgHiMagenta)
			cfg.Storage.OutputFormats = *overrides.Storage.OutputFormats
		}
		if overrides.Storage.SavePath != nil {
			utils.PrintColored("Overriding Storage.SavePath: ", *overrides.Storage.SavePath, color.FgHiMagenta)
			cfg.Storage.SavePath = *overrides.Storage.SavePath
		}
		if overrides.Storage.FileName != nil {
			utils.PrintColored("Overriding Storage.FileName: ", *overrides.Storage.FileName, color.FgHiMagenta)
			cfg.Storage.FileName = *overrides.Storage.FileName
		}
	}

	// Override ScrapingOptions fields.
	if overrides.ScrapingOptions != nil {
		if overrides.ScrapingOptions.MaxDepth != nil {
			utils.PrintColored("Overriding ScrapingOptions.MaxDepth: ", fmt.Sprint(*overrides.ScrapingOptions.MaxDepth), color.FgHiMagenta)
			cfg.ScrapingOptions.MaxDepth = *overrides.ScrapingOptions.MaxDepth
		}
		if overrides.ScrapingOptions.RateLimit != nil {
			utils.PrintColored("Overriding ScrapingOptions.RateLimit: ", fmt.Sprint(*overrides.ScrapingOptions.RateLimit), color.FgHiMagenta)
			cfg.ScrapingOptions.RateLimit = *overrides.ScrapingOptions.RateLimit
		}
		if overrides.ScrapingOptions.RetryAttempts != nil {
			utils.PrintColored("Overriding ScrapingOptions.RetryAttempts: ", fmt.Sprint(*overrides.ScrapingOptions.RetryAttempts), color.FgHiMagenta)
			cfg.ScrapingOptions.RetryAttempts = *overrides.ScrapingOptions.RetryAttempts
		}
		if overrides.ScrapingOptions.UserAgent != nil {
			utils.PrintColored("Overriding ScrapingOptions.UserAgent: ", *overrides.ScrapingOptions.UserAgent, color.FgHiMagenta)
			cfg.ScrapingOptions.UserAgent = *overrides.ScrapingOptions.UserAgent
		}
	}

	// Override DataFormatting fields.
	if overrides.DataFormatting != nil {
		if overrides.DataFormatting.CleanWhitespace != nil {
			utils.PrintColored("Overriding DataFormatting.CleanWhitespace: ", fmt.Sprint(*overrides.DataFormatting.CleanWhitespace), color.FgHiMagenta)
			cfg.DataFormatting.CleanWhitespace = *overrides.DataFormatting.CleanWhitespace
		}
		if overrides.DataFormatting.RemoveHTML != nil {
			utils.PrintColored("Overriding DataFormatting.RemoveHTML: ", fmt.Sprint(*overrides.DataFormatting.RemoveHTML), color.FgHiMagenta)
			cfg.DataFormatting.RemoveHTML = *overrides.DataFormatting.RemoveHTML
		}
	}
}

// ---------------------------------------------------------------------------
// File: pkg/config/config_test.go
// ---------------------------------------------------------------------------

package config

import (
	"fmt"
	"os"
	"reflect"
	"strings"
	"testing"

	"bou.ke/monkey"
	"github.com/heinrichb/scrapey-cli/pkg/utils"
)

// Helper functions to easily create pointer values.
func ptrString(s string) *string    { return &s }
func ptrInt(i int) *int             { return &i }
func ptrFloat64(f float64) *float64 { return &f }
func ptrBool(b bool) *bool          { return &b }

// TestApplyDefaults tests the ApplyDefaults function to ensure that missing fields are set to default values.
+func TestApplyDefaults(t *testing.T) { + cases := []struct { + desc string + setup func(cfg *Config) + validate func(t *testing.T, cfg *Config) + }{ + { + desc: "All fields missing should be set to defaults", + setup: func(cfg *Config) {}, + validate: func(t *testing.T, cfg *Config) { + if cfg.URL.Base != "https://example.com" { + t.Errorf("Expected URL.Base to be 'https://example.com', got '%s'", cfg.URL.Base) + } + if len(cfg.URL.Routes) != 1 || cfg.URL.Routes[0] != "/" { + t.Errorf("Expected URL.Routes to be ['/'], got %v", cfg.URL.Routes) + } + if cfg.ScrapingOptions.MaxDepth != 2 { + t.Errorf("Expected ScrapingOptions.MaxDepth to be 2, got %d", cfg.ScrapingOptions.MaxDepth) + } + if cfg.ScrapingOptions.RateLimit != 1.5 { + t.Errorf("Expected ScrapingOptions.RateLimit to be 1.5, got %f", cfg.ScrapingOptions.RateLimit) + } + if cfg.ScrapingOptions.RetryAttempts != 3 { + t.Errorf("Expected ScrapingOptions.RetryAttempts to be 3, got %d", cfg.ScrapingOptions.RetryAttempts) + } + expectedUA := "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + if cfg.ScrapingOptions.UserAgent != expectedUA { + t.Errorf("Expected ScrapingOptions.UserAgent to be '%s', got '%s'", expectedUA, cfg.ScrapingOptions.UserAgent) + } + if len(cfg.Storage.OutputFormats) != 1 || cfg.Storage.OutputFormats[0] != "json" { + t.Errorf("Expected Storage.OutputFormats to be ['json'], got %v", cfg.Storage.OutputFormats) + } + if cfg.Storage.SavePath != "output/" { + t.Errorf("Expected Storage.SavePath to be 'output/', got '%s'", cfg.Storage.SavePath) + } + if cfg.Storage.FileName != "scraped_data" { + t.Errorf("Expected Storage.FileName to be 'scraped_data', got '%s'", cfg.Storage.FileName) + } + }, + }, + { + desc: "Pre-set fields remain unchanged and missing fields get defaults", + setup: func(cfg *Config) { + cfg.URL.Base = "https://preset.com" + cfg.Storage.SavePath = "custom_output/" + }, + validate: func(t *testing.T, cfg *Config) { + if cfg.URL.Base != "https://preset.com" { + 
t.Errorf("Expected URL.Base to be 'https://preset.com', got '%s'", cfg.URL.Base) + } + if cfg.Storage.SavePath != "custom_output/" { + t.Errorf("Expected Storage.SavePath to be 'custom_output/', got '%s'", cfg.Storage.SavePath) + } + if len(cfg.URL.Routes) != 1 || cfg.URL.Routes[0] != "/" { + t.Errorf("Expected URL.Routes to be ['/'], got %v", cfg.URL.Routes) + } + if cfg.ScrapingOptions.MaxDepth != 2 { + t.Errorf("Expected ScrapingOptions.MaxDepth to be 2, got %d", cfg.ScrapingOptions.MaxDepth) + } + if len(cfg.Storage.OutputFormats) != 1 || cfg.Storage.OutputFormats[0] != "json" { + t.Errorf("Expected Storage.OutputFormats to be ['json'], got %v", cfg.Storage.OutputFormats) + } + if cfg.Storage.FileName != "scraped_data" { + t.Errorf("Expected Storage.FileName to be 'scraped_data', got '%s'", cfg.Storage.FileName) + } + }, + }, + { + desc: "No change if all fields are pre-set", + setup: func(cfg *Config) { + cfg.URL.Base = "https://preset.com" + cfg.URL.Routes = []string{"/preset"} + cfg.ScrapingOptions.MaxDepth = 10 + cfg.ScrapingOptions.RateLimit = 3.0 + cfg.ScrapingOptions.RetryAttempts = 5 + cfg.ScrapingOptions.UserAgent = "CustomAgent" + cfg.Storage.OutputFormats = []string{"xml"} + cfg.Storage.SavePath = "preset_output/" + cfg.Storage.FileName = "preset_data" + }, + validate: func(t *testing.T, cfg *Config) { + if cfg.URL.Base != "https://preset.com" { + t.Errorf("Expected URL.Base to be 'https://preset.com', got '%s'", cfg.URL.Base) + } + if !reflect.DeepEqual(cfg.URL.Routes, []string{"/preset"}) { + t.Errorf("Expected URL.Routes to be ['/preset'], got %v", cfg.URL.Routes) + } + if cfg.ScrapingOptions.MaxDepth != 10 { + t.Errorf("Expected ScrapingOptions.MaxDepth to be 10, got %d", cfg.ScrapingOptions.MaxDepth) + } + if cfg.ScrapingOptions.RateLimit != 3.0 { + t.Errorf("Expected ScrapingOptions.RateLimit to be 3.0, got %f", cfg.ScrapingOptions.RateLimit) + } + if cfg.ScrapingOptions.RetryAttempts != 5 { + t.Errorf("Expected ScrapingOptions.RetryAttempts to 
be 5, got %d", cfg.ScrapingOptions.RetryAttempts) + } + if cfg.ScrapingOptions.UserAgent != "CustomAgent" { + t.Errorf("Expected ScrapingOptions.UserAgent to be 'CustomAgent', got '%s'", cfg.ScrapingOptions.UserAgent) + } + if !reflect.DeepEqual(cfg.Storage.OutputFormats, []string{"xml"}) { + t.Errorf("Expected Storage.OutputFormats to be ['xml'], got %v", cfg.Storage.OutputFormats) + } + if cfg.Storage.SavePath != "preset_output/" { + t.Errorf("Expected Storage.SavePath to be 'preset_output/', got '%s'", cfg.Storage.SavePath) + } + if cfg.Storage.FileName != "preset_data" { + t.Errorf("Expected Storage.FileName to be 'preset_data', got '%s'", cfg.Storage.FileName) + } + }, + }, + } + + for _, tc := range cases { + t.Run(tc.desc, func(t *testing.T) { + cfg := &Config{} + if tc.setup != nil { + tc.setup(cfg) + } + cfg.ApplyDefaults() + tc.validate(t, cfg) + }) + } +} + +// TestLoad tests the Load function with various file conditions. +func TestLoad(t *testing.T) { + var capturedColored string + patchColored := monkey.Patch(utils.PrintColored, func(a ...interface{}) { + capturedColored += fmt.Sprint(a...) 
+ }) + defer patchColored.Unpatch() + + var capturedNonEmpty string + patchNonEmpty := monkey.Patch(utils.PrintNonEmptyFields, func(prefix string, cfg interface{}) { + capturedNonEmpty += "nonEmptyFieldsCalled" + }) + defer patchNonEmpty.Unpatch() + + cases := []struct { + desc string + fileSetup func(fileName string) + verbose bool + expectErr bool + checkOutput func(t *testing.T, colored, nonEmpty string) + }{ + { + desc: "Missing config file", + fileSetup: nil, + verbose: false, + expectErr: true, + checkOutput: func(t *testing.T, colored, nonEmpty string) { + if colored != "" { + t.Errorf("Expected no colored output for missing file, got: %s", colored) + } + }, + }, + { + desc: "Unreadable config file", + fileSetup: func(name string) { + if err := os.WriteFile(name, []byte(`{"url": {"base": "http://example.org"}}`), 0644); err != nil { + t.Fatalf("Failed to write file: %v", err) + } + }, + verbose: false, + expectErr: true, + checkOutput: func(t *testing.T, colored, nonEmpty string) { + if !strings.Contains(colored, "Loaded config from: ") { + t.Errorf("Expected colored output, got: %s", colored) + } + }, + }, + { + desc: "Invalid JSON format", + fileSetup: func(name string) { + if err := os.WriteFile(name, []byte(`{"url": {"base": "http://example.org"`), 0644); err != nil { + t.Fatalf("Failed to write file: %v", err) + } + }, + verbose: false, + expectErr: true, + checkOutput: func(t *testing.T, colored, nonEmpty string) { + if !strings.Contains(colored, "Loaded config from: ") { + t.Errorf("Expected colored output, got: %s", colored) + } + }, + }, + { + desc: "Valid JSON without verbose mode", + fileSetup: func(name string) { + if err := os.WriteFile(name, []byte(`{"url": {"base": "http://example.org"}}`), 0644); err != nil { + t.Fatalf("Failed to write file: %v", err) + } + }, + verbose: false, + expectErr: false, + checkOutput: func(t *testing.T, colored, nonEmpty string) { + if !strings.Contains(colored, "Loaded config from: ") { + t.Errorf("Expected 
colored output, got: %s", colored) + } + if nonEmpty != "" { + t.Errorf("Expected no non-empty output when verbose is false, got: %s", nonEmpty) + } + }, + }, + { + desc: "Valid JSON with verbose mode", + fileSetup: func(name string) { + if err := os.WriteFile(name, []byte(`{"url": {"base": "http://example.org"}}`), 0644); err != nil { + t.Fatalf("Failed to write file: %v", err) + } + }, + verbose: true, + expectErr: false, + checkOutput: func(t *testing.T, colored, nonEmpty string) { + if !strings.Contains(colored, "Loaded config from: ") { + t.Errorf("Expected colored output, got: %s", colored) + } + if nonEmpty != "nonEmptyFieldsCalled" { + t.Errorf("Expected non-empty output when verbose is true, got: %s", nonEmpty) + } + }, + }, + } + + for _, tc := range cases { + t.Run(tc.desc, func(t *testing.T) { + capturedColored = "" + patchNonEmpty.Unpatch() + patchNonEmpty = monkey.Patch(utils.PrintNonEmptyFields, func(prefix string, cfg interface{}) { + capturedNonEmpty += "nonEmptyFieldsCalled" + }) + defer patchNonEmpty.Unpatch() + Verbose = tc.verbose + + var fileName string + if tc.fileSetup != nil { + tmpFile, err := os.CreateTemp("", "config_*.json") + if err != nil { + t.Fatalf("Failed to create temp file: %v", err) + } + fileName = tmpFile.Name() + tmpFile.Close() + tc.fileSetup(fileName) + os.Chmod(fileName, 0644) + defer os.Remove(fileName) + } else { + fileName = "nonexistent_config.json" + } + + if tc.desc == "Unreadable config file" { + patchReadFile := monkey.Patch(os.ReadFile, func(name string) ([]byte, error) { + return nil, fmt.Errorf("simulated read error") + }) + defer patchReadFile.Unpatch() + } + + cfg, err := Load(fileName) + if tc.expectErr { + if err == nil { + t.Errorf("Expected error but got nil") + } + return + } else { + if err != nil { + t.Errorf("Unexpected error: %v", err) + return + } + } + if cfg.URL.Base == "" { + t.Errorf("Expected URL.Base to be set, got empty") + } + tc.checkOutput(t, capturedColored, capturedNonEmpty) + }) + } +} 
// TestOverrideConfig combines the previous TestOverrideConfigFull and TestOverrideConfigNil into a single test.
// It verifies that a full override updates all fields and that a nil override leaves the config unchanged.
// The anonymous struct literals below must match the field layout of ConfigOverride exactly.
func TestOverrideConfig(t *testing.T) {
	cases := []struct {
		desc          string
		overrideSetup func() ConfigOverride
		validate      func(t *testing.T, base *Config, captured string)
	}{
		{
			desc: "Full override applies all changes",
			overrideSetup: func() ConfigOverride {
				return ConfigOverride{
					Version: ptrString("v2.0"),
					URL: &struct {
						Base        *string   `json:"base"`
						Routes      *[]string `json:"routes"`
						IncludeBase *bool     `json:"includeBase"`
					}{
						Base:        ptrString("https://override.com"),
						Routes:      &[]string{"/new", "/extra"},
						IncludeBase: ptrBool(true),
					},
					ParseRules: &struct {
						Title           *string `json:"title,omitempty"`
						MetaDescription *string `json:"metaDescription,omitempty"`
						ArticleContent  *string `json:"articleContent,omitempty"`
						Author          *string `json:"author,omitempty"`
						DatePublished   *string `json:"datePublished,omitempty"`
					}{
						Title:           ptrString("New Title"),
						MetaDescription: ptrString("New Meta"),
						ArticleContent:  ptrString("New Content"),
						Author:          ptrString("New Author"),
						DatePublished:   ptrString("2022-01-01"),
					},
					Storage: &struct {
						OutputFormats *[]string `json:"outputFormats"`
						SavePath      *string   `json:"savePath"`
						FileName      *string   `json:"fileName"`
					}{
						OutputFormats: &[]string{"csv"},
						SavePath:      ptrString("new_output/"),
						FileName:      ptrString("new_data"),
					},
					ScrapingOptions: &struct {
						MaxDepth      *int     `json:"maxDepth"`
						RateLimit     *float64 `json:"rateLimit"`
						RetryAttempts *int     `json:"retryAttempts"`
						UserAgent     *string  `json:"userAgent"`
					}{
						MaxDepth:      ptrInt(5),
						RateLimit:     ptrFloat64(2.0),
						RetryAttempts: ptrInt(4),
						UserAgent:     ptrString("OverrideAgent"),
					},
					DataFormatting: &struct {
						CleanWhitespace *bool `json:"cleanWhitespace"`
						RemoveHTML      *bool `json:"removeHTML"`
					}{
						CleanWhitespace: ptrBool(true),
						RemoveHTML:      ptrBool(true),
					},
				}
			},
			validate: func(t *testing.T, base *Config, captured string) {
				if base.Version != "v2.0" {
					t.Errorf("Expected Version to be 'v2.0', got '%s'", base.Version)
				}
				if base.URL.Base != "https://override.com" {
					t.Errorf("Expected URL.Base to be 'https://override.com', got '%s'", base.URL.Base)
				}
				if !reflect.DeepEqual(base.URL.Routes, []string{"/new", "/extra"}) {
					t.Errorf("Expected URL.Routes to be ['/new', '/extra'], got %v", base.URL.Routes)
				}
				if !base.URL.IncludeBase {
					t.Errorf("Expected URL.IncludeBase to be true")
				}
				if base.ParseRules.Title != "New Title" {
					t.Errorf("Expected ParseRules.Title to be 'New Title', got '%s'", base.ParseRules.Title)
				}
				if base.ParseRules.MetaDescription != "New Meta" {
					t.Errorf("Expected ParseRules.MetaDescription to be 'New Meta', got '%s'", base.ParseRules.MetaDescription)
				}
				if base.ParseRules.ArticleContent != "New Content" {
					t.Errorf("Expected ParseRules.ArticleContent to be 'New Content', got '%s'", base.ParseRules.ArticleContent)
				}
				if base.ParseRules.Author != "New Author" {
					t.Errorf("Expected ParseRules.Author to be 'New Author', got '%s'", base.ParseRules.Author)
				}
				if base.ParseRules.DatePublished != "2022-01-01" {
					t.Errorf("Expected ParseRules.DatePublished to be '2022-01-01', got '%s'", base.ParseRules.DatePublished)
				}
				if !reflect.DeepEqual(base.Storage.OutputFormats, []string{"csv"}) {
					t.Errorf("Expected Storage.OutputFormats to be ['csv'], got %v", base.Storage.OutputFormats)
				}
				if base.Storage.SavePath != "new_output/" {
					t.Errorf("Expected Storage.SavePath to be 'new_output/', got '%s'", base.Storage.SavePath)
				}
				if base.Storage.FileName != "new_data" {
					t.Errorf("Expected Storage.FileName to be 'new_data', got '%s'", base.Storage.FileName)
				}
				if base.ScrapingOptions.MaxDepth != 5 {
					t.Errorf("Expected ScrapingOptions.MaxDepth to be 5, got %d", base.ScrapingOptions.MaxDepth)
				}
				if base.ScrapingOptions.RateLimit != 2.0 {
					t.Errorf("Expected ScrapingOptions.RateLimit to be 2.0, got %f", base.ScrapingOptions.RateLimit)
				}
				if base.ScrapingOptions.RetryAttempts != 4 {
					t.Errorf("Expected ScrapingOptions.RetryAttempts to be 4, got %d", base.ScrapingOptions.RetryAttempts)
				}
				if base.ScrapingOptions.UserAgent != "OverrideAgent" {
					t.Errorf("Expected ScrapingOptions.UserAgent to be 'OverrideAgent', got '%s'", base.ScrapingOptions.UserAgent)
				}
				if !base.DataFormatting.CleanWhitespace {
					t.Errorf("Expected DataFormatting.CleanWhitespace to be true")
				}
				if !base.DataFormatting.RemoveHTML {
					t.Errorf("Expected DataFormatting.RemoveHTML to be true")
				}

				// Verify that PrintColored was called for each overridden field.
				expectedSubstrs := []string{
					"Overriding Version: v2.0",
					"Overriding URL.Base: https://override.com",
					"Overriding URL.Routes: [",
					"Overriding URL.IncludeBase: true",
					"Overriding ParseRules.Title: New Title",
					"Overriding ParseRules.MetaDescription: New Meta",
					"Overriding ParseRules.ArticleContent: New Content",
					"Overriding ParseRules.Author: New Author",
					"Overriding ParseRules.DatePublished: 2022-01-01",
					"Overriding Storage.OutputFormats: [",
					"Overriding Storage.SavePath: new_output/",
					"Overriding Storage.FileName: new_data",
					"Overriding ScrapingOptions.MaxDepth: 5",
					"Overriding ScrapingOptions.RateLimit: 2",
					"Overriding ScrapingOptions.RetryAttempts: 4",
					"Overriding ScrapingOptions.UserAgent: OverrideAgent",
					"Overriding DataFormatting.CleanWhitespace: true",
					"Overriding DataFormatting.RemoveHTML: true",
				}
				for _, substr := range expectedSubstrs {
					if !strings.Contains(captured, substr) {
						t.Errorf("Expected output to contain '%s', got '%s'", substr, captured)
					}
				}
			},
		},
		{
			desc: "Nil override leaves config unchanged",
			overrideSetup: func() ConfigOverride {
				return ConfigOverride{}
			},
			validate: func(t *testing.T, base *Config, captured string) {
				// Build a default config to compare.
				defaultConfig := &Config{}
				defaultConfig.ApplyDefaults()
				if !reflect.DeepEqual(base, defaultConfig) {
					t.Errorf("Expected config to remain unchanged when overrides are nil. Got %+v, expected %+v", base, defaultConfig)
				}
				// No PrintColored calls should be made.
				if captured != "" {
					t.Errorf("Expected no output from PrintColored when no overrides are applied, got '%s'", captured)
				}
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.desc, func(t *testing.T) {
			var captured string
			patchColored := monkey.Patch(utils.PrintColored, func(a ...interface{}) {
				captured += fmt.Sprint(a...)
			})
			defer patchColored.Unpatch()

			// Create a base config with defaults applied.
			base := &Config{}
			base.ApplyDefaults()

			// Apply the override from this test case.
			override := tc.overrideSetup()
			base.OverrideConfig(override)

			tc.validate(t, base, captured)
		})
	}
}

// ---------------------------------------------------------------------------
// File: pkg/crawler/crawler.go
// ---------------------------------------------------------------------------

package crawler

/*
Crawler is responsible for fetching HTML content from URLs.

Usage:

	Create an instance of Crawler using New() and then call FetchURL
	to retrieve the HTML content from a specified URL.

Notes:
  - This implementation is currently a stub.
  - Future enhancements may include handling HTTP GET requests,
    concurrency, rate-limiting, timeouts, retries, and robust error handling.
*/
type Crawler struct {
	// Fields for storing configuration or concurrency settings can be added here.
}
+ +Usage: + + c := New() +*/ func New() *Crawler { return &Crawler{} } -// FetchURL fetches the contents of a given URL. -// We'll eventually handle HTTP GET requests, timeouts, retries, etc. +/* +FetchURL retrieves the HTML content from the specified URL. + +Parameters: + - url: A string representing the URL to fetch. + +Returns: + - A string containing the HTML content (if successful) or an empty string. + - An error if the fetch operation fails. + +Usage: + + content, err := c.FetchURL("http://example.com") + if err != nil { + // Handle error. + } + +Notes: + - This function is currently a stub and returns an empty string with a nil error. + - Future implementations will include actual HTTP request handling. +*/ func (c *Crawler) FetchURL(url string) (string, error) { - // Stub: return placeholder HTML or empty string for now + // Stub: return placeholder HTML or empty string for now. return "", nil } diff --git a/pkg/crawler/crawler_test.go b/pkg/crawler/crawler_test.go new file mode 100644 index 0000000..d7e7c74 --- /dev/null +++ b/pkg/crawler/crawler_test.go @@ -0,0 +1,26 @@ +// File: pkg/crawler/crawler_test.go + +package crawler + +import "testing" + +// TestNew verifies that New returns a valid (non-nil) instance of Crawler. +func TestNew(t *testing.T) { + c := New() + if c == nil { + t.Error("Expected New() to return a non-nil Crawler instance") + } +} + +// TestFetchURL verifies that FetchURL returns an empty string and nil error +// regardless of the input URL, as it is currently a stub. 
+func TestFetchURL(t *testing.T) { + c := New() + content, err := c.FetchURL("http://example.com") + if err != nil { + t.Errorf("Expected no error from FetchURL, got: %v", err) + } + if content != "" { + t.Errorf("Expected empty content from FetchURL, got: %q", content) + } +} diff --git a/pkg/parser/parser.go b/pkg/parser/parser.go index 75e5396..e146a68 100644 --- a/pkg/parser/parser.go +++ b/pkg/parser/parser.go @@ -1,7 +1,33 @@ +// File: pkg/parser/parser.go + package parser -// ParseHTML analyzes HTML content and extracts data based on config or rules. -// We'll expand this to handle specific selectors, attributes, etc. +/* +ParseHTML analyzes HTML content and extracts data based on configuration or rules. + +Parameters: + - htmlContent: A string containing the HTML to be parsed. + +Returns: + - A map with string keys and values representing the extracted data. + - An error if parsing fails. + +Usage: + + This function is currently a stub. In the future, it will be expanded to handle specific + selectors, attributes, and more complex parsing logic to extract meaningful data from HTML. + +Example: + + data, err := ParseHTML("...") + if err != nil { + // Handle error + } + // Use the extracted data from 'data' + +Notes: + - For now, the function returns an empty map and a nil error. +*/ func ParseHTML(htmlContent string) (map[string]string, error) { // Stub: for now, just return an empty map return map[string]string{}, nil diff --git a/pkg/parser/parser_test.go b/pkg/parser/parser_test.go new file mode 100644 index 0000000..6c41ee6 --- /dev/null +++ b/pkg/parser/parser_test.go @@ -0,0 +1,28 @@ +// File: pkg/parser/parser_test.go + +package parser + +import "testing" + +// TestParseHTML verifies that ParseHTML returns an empty map and no error +// regardless of the input provided. +func TestParseHTML(t *testing.T) { + // Test with a non-empty HTML string. + html := "
Hello, World!
" + data, err := ParseHTML(html) + if err != nil { + t.Errorf("Expected no error for non-empty input, got %v", err) + } + if len(data) != 0 { + t.Errorf("Expected empty map for non-empty input, got %v", data) + } + + // Test with an empty string. + data, err = ParseHTML("") + if err != nil { + t.Errorf("Expected no error for empty input, got %v", err) + } + if len(data) != 0 { + t.Errorf("Expected empty map for empty input, got %v", data) + } +} diff --git a/pkg/storage/storage.go b/pkg/storage/storage.go index 08b6978..e9adc8b 100644 --- a/pkg/storage/storage.go +++ b/pkg/storage/storage.go @@ -1,6 +1,22 @@ +// File: pkg/storage/storage.go + package storage -// StorageOption enumerates the types of storage we might support. +/* +StorageOption enumerates the types of storage we might support. + +Constants: + + JSON - Data stored in JSON format. + XML - Data stored in XML format. + Excel - Data stored in Excel format. + MongoDB - Data stored in a MongoDB database. + MySQL - Data stored in a MySQL database. + +Usage: + + These constants are used with SaveData to specify the desired output format. +*/ type StorageOption int const ( @@ -11,8 +27,31 @@ const ( MySQL ) -// SaveData will eventually accept the extracted data and store it in various formats. -// This could later become multiple functions or a strategy pattern. +/* +SaveData accepts extracted data as a map of strings and stores it in the format specified +by the option parameter. + +Parameters: + - data: A map where each key/value pair represents a piece of extracted data. + - option: A StorageOption value indicating the format in which to store the data. + +Usage: + + This function serves as a placeholder for future storage implementations. + It may later be extended into a strategy pattern to support multiple storage formats, + such as JSON, XML, Excel, MongoDB, or MySQL. + +Example: + + err := SaveData(myData, JSON) + if err != nil { + // Handle the error accordingly. 
+ } + +Notes: + - Currently, this function is a stub and does not perform any storage operations. + - It always returns nil. +*/ func SaveData(data map[string]string, option StorageOption) error { // Stub: for now, do nothing. return nil diff --git a/pkg/storage/storage_test.go b/pkg/storage/storage_test.go new file mode 100644 index 0000000..0ec6fb8 --- /dev/null +++ b/pkg/storage/storage_test.go @@ -0,0 +1,24 @@ +// File: pkg/storage/storage_test.go + +package storage + +import "testing" + +// TestSaveData verifies that SaveData always returns nil regardless of the input. +// This ensures full test coverage for the stub implementation. +func TestSaveData(t *testing.T) { + // Test with non-empty data. + testData := map[string]string{"example": "data"} + options := []StorageOption{JSON, XML, Excel, MongoDB, MySQL} + + for _, opt := range options { + if err := SaveData(testData, opt); err != nil { + t.Errorf("SaveData returned an error for option %v: %v", opt, err) + } + } + + // Also test with an empty map. + if err := SaveData(map[string]string{}, JSON); err != nil { + t.Errorf("SaveData returned an error for empty map: %v", err) + } +} diff --git a/pkg/utils/printcolor.go b/pkg/utils/printcolor.go new file mode 100644 index 0000000..8eba872 --- /dev/null +++ b/pkg/utils/printcolor.go @@ -0,0 +1,160 @@ +// File: pkg/utils/printcolor.go + +package utils + +import ( + "fmt" + "io" + "os" + "reflect" + + "github.com/fatih/color" +) + +/* +FprintColored writes a colored line to the provided writer. + +Parameters: + - w: The io.Writer where output is written. + - prefix: The string to print in the specified color (defaults to white if no color is provided). + - secondary: The string printed immediately after the colored prefix. + - attrs: Variadic color attributes; if none are provided, white is used. + +Usage: + + FprintColored(os.Stdout, "Loaded config from: ", configPath, color.FgHiGreen) + +Notes: + + If 'secondary' is empty, only the colored prefix is printed. 
+*/ +func FprintColored(w io.Writer, prefix, secondary string, attrs ...color.Attribute) { + var c *color.Color + if len(attrs) > 0 { + c = color.New(attrs...) + } else { + c = color.New(color.FgWhite) + } + if secondary != "" { + fmt.Fprintf(w, "%s%s\n", c.Sprint(prefix), secondary) + } else { + fmt.Fprintln(w, c.Sprint(prefix)) + } +} + +/* +FprintColoredDynamic writes multiple colored string segments to the provided writer on one line. + +Parameters: + - w: The io.Writer for output. + - texts: A slice of strings to print sequentially. + - colors: A slice of color attributes corresponding to each text. + If there are more texts than colors, the last provided color is used for the remaining texts. + +Usage: + + FprintColoredDynamic(os.Stdout, []string{"A ", "B ", "C"}, []color.Attribute{color.FgHiGreen, color.FgHiMagenta}) + +Notes: + + All text segments are printed on the same line, followed by a newline. +*/ +func FprintColoredDynamic(w io.Writer, texts []string, colors []color.Attribute) { + for i, text := range texts { + var attr color.Attribute + if i < len(colors) { + attr = colors[i] + } else if len(colors) > 0 { + attr = colors[len(colors)-1] + } else { + attr = color.FgWhite + } + fmt.Fprint(w, color.New(attr).Sprint(text)) + } + fmt.Fprintln(w) +} + +/* +PrintColoredDynamicToStdout is a convenience function that writes dynamic colored output to os.Stdout. +*/ +func PrintColoredDynamicToStdout(texts []string, colors []color.Attribute) { + FprintColoredDynamic(os.Stdout, texts, colors) +} + +/* +PrintColored is the main exported function for this utility. +It dynamically determines how to print colored output based on the types of arguments passed. + +Usage: + 1. To print a single string: + PrintColored("Just a string") + 2. To print a prefix and secondary string with a color: + PrintColored("Prefix: ", "Secondary", color.FgHiGreen) + 3. 
To print multiple segments with individual colors: + PrintColored([]string{"Segment1 ", "Segment2 ", "Segment3"}, + []color.Attribute{color.FgHiGreen, color.FgHiMagenta}) + +Behavior: + - If the first argument is a []string, it expects a second argument as []color.Attribute and calls the dynamic printer. + - Otherwise, if the first argument is a string: + - With one argument, prints that string in white. + - With two arguments (both strings), prints the first in white and the second in default formatting. + - With additional arguments of type color.Attribute (or a slice thereof), uses them to color the prefix. +*/ +func PrintColored(args ...interface{}) { + if len(args) == 0 { + return + } + + // Dynamic mode: if the first argument is a []string. + if texts, ok := args[0].([]string); ok { + var colors []color.Attribute + if len(args) > 1 { + if cols, ok := args[1].([]color.Attribute); ok { + colors = cols + } + } + PrintColoredDynamicToStdout(texts, colors) + return + } + + // Otherwise, assume the first argument is a string. + prefix, ok := args[0].(string) + if !ok { + return + } + + secondary := "" + if len(args) >= 2 { + if sec, ok := args[1].(string); ok { + secondary = sec + } + } + + var attrs []color.Attribute + if len(args) > 2 { + // Collect any color attributes (supports individual values or a slice). + for _, arg := range args[2:] { + v := reflect.ValueOf(arg) + switch v.Kind() { + case reflect.Slice: + for i := 0; i < v.Len(); i++ { + item := v.Index(i).Interface() + if attr, ok := item.(color.Attribute); ok { + attrs = append(attrs, attr) + } + } + default: + if attr, ok := arg.(color.Attribute); ok { + attrs = append(attrs, attr) + } + } + } + } + + if len(attrs) == 0 { + attrs = append(attrs, color.FgWhite) + } + + FprintColored(os.Stdout, prefix, secondary, attrs...) 
+} diff --git a/pkg/utils/printcolor_test.go b/pkg/utils/printcolor_test.go new file mode 100644 index 0000000..7d406b2 --- /dev/null +++ b/pkg/utils/printcolor_test.go @@ -0,0 +1,150 @@ +// File: pkg/utils/printcolor_test.go + +package utils + +import ( + "bytes" + "io" + "os" + "strings" + "testing" + + "github.com/fatih/color" +) + +// init forces ANSI color output during tests. +func init() { + os.Setenv("TERM", "xterm-256color") + color.NoColor = false +} + +// captureStdout redirects os.Stdout during the execution of f() and returns the captured output. +func captureStdout(f func()) string { + oldStdout := os.Stdout + r, w, _ := os.Pipe() + os.Stdout = w + + f() + + w.Close() + os.Stdout = oldStdout + var buf bytes.Buffer + io.Copy(&buf, r) + return buf.String() +} + +// TestPrintColored exercises all branches of the PrintColored function in a table-driven test. +// Each case is documented to explain what branch of PrintColored is being hit. +func TestPrintColored(t *testing.T) { + tests := []struct { + name string + args []interface{} + // expectedContains lists substrings that must appear in the output. + expectedContains []string + // expectEmpty indicates that no output should be produced. + expectEmpty bool + }{ + { + name: "No arguments: nothing should be printed", + args: []interface{}{}, + expectEmpty: true, + }, + { + name: "Non-string first argument: invalid input produces no output", + args: []interface{}{123}, + expectEmpty: true, + }, + { + name: "Single string: prints in default white", + args: []interface{}{"Just a test string"}, + expectedContains: []string{"Just a test string", "\x1b["}, + }, + { + name: "Two strings with individual color: prefix and secondary", + args: []interface{}{"Prefix: ", "Value", color.FgHiGreen}, + expectedContains: []string{"Prefix: ", "Value", "\x1b[92m"}, // \x1b[92m represents high-intensity green. 
+ }, + { + name: "Dynamic mode with valid colors: multiple segments with corresponding colors", + args: []interface{}{ + []string{"Segment1 ", "Segment2 ", "Segment3"}, + []color.Attribute{color.FgHiGreen, color.FgHiMagenta}, + }, + expectedContains: []string{ + "Segment1 ", "Segment2 ", "Segment3", + "\x1b[92m", // ANSI code for high-intensity green. + "\x1b[95m", // ANSI code for high-intensity magenta. + }, + }, + { + name: "Dynamic mode with invalid second argument: defaults to white", + args: []interface{}{ + []string{"Only segment"}, + 123, // Invalid second argument; triggers default white. + }, + expectedContains: []string{"Only segment", "\x1b["}, + }, + { + name: "Mixed arguments with a slice of colors: unpacking color slice correctly", + args: []interface{}{ + "Mixed: ", + "Value", + []color.Attribute{color.FgHiYellow, color.FgHiBlue}, + }, + // fatih/color combines the two attributes into one ANSI sequence (\x1b[93;94m) + expectedContains: []string{ + "Mixed: ", "Value", + "\x1b[93;94m", // Combined ANSI sequence for high-intensity yellow and blue. + }, + }, + } + + // Iterate over each test case. + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.name, func(t *testing.T) { + output := captureStdout(func() { + PrintColored(tc.args...) + }) + if tc.expectEmpty { + if output != "" { + t.Errorf("Expected no output, but got: %q", output) + } + return + } + // Verify that each expected substring is present in the output. + for _, substr := range tc.expectedContains { + if !strings.Contains(output, substr) { + t.Errorf("Test case %q: expected output to contain %q, but it did not. Full output: %q", tc.name, substr, output) + } + } + }) + } +} + +// TestFprintColoredEmptyAttrs directly tests FprintColored with an empty attribute slice. +// This exercises the branch where no color attributes are provided, +// which is never reached via PrintColored (because it always supplies a default).
+func TestFprintColoredEmptyAttrs(t *testing.T) { + var buf bytes.Buffer + + // Case 1: secondary is empty. + FprintColored(&buf, "directTest", "") + output := buf.String() + if !strings.Contains(output, "directTest") { + t.Errorf("Expected output to contain %q, got %q", "directTest", output) + } + if !strings.HasSuffix(output, "\n") { + t.Errorf("Expected output to end with a newline, got %q", output) + } + + // Reset buffer for next case. + buf.Reset() + + // Case 2: secondary is non-empty. + FprintColored(&buf, "directTest", "Extra") + output = buf.String() + if !strings.Contains(output, "directTest") || !strings.Contains(output, "Extra") { + t.Errorf("Expected output to contain both %q and %q, got %q", "directTest", "Extra", output) + } +} diff --git a/pkg/utils/printstruct.go b/pkg/utils/printstruct.go new file mode 100644 index 0000000..49e9661 --- /dev/null +++ b/pkg/utils/printstruct.go @@ -0,0 +1,69 @@ +// File: pkg/utils/printstruct.go + +package utils + +import ( + "reflect" + + "github.com/fatih/color" +) + +/* +PrintNonEmptyFields dynamically traverses a struct and prints its non-empty string fields. + +Parameters: + - prefix: A string to prepend to the field name, used to represent nested struct hierarchy (e.g., "Parent.Child."). + - v: The struct or pointer to a struct to be traversed and inspected. + +Usage: + + This function is useful for dynamically inspecting and displaying configurations or other data structures + where the fields may be optional, and only non-empty values are of interest. + +Example: + + Given a struct: + + type Config struct { + URL string + Nested struct { + Title string + } + } + + Calling PrintNonEmptyFields("", configInstance) will output something like: + + URL: http://example.com + Nested.Title: Example Title + +Notes: + - This function relies on the reflect package and assumes that the input is a struct or a pointer to a struct. + - Only string fields are checked for non-emptiness; other types are ignored. 
+ - Colored output is now handled by the PrintColored utility from this package. +*/ +func PrintNonEmptyFields(prefix string, v interface{}) { + val := reflect.ValueOf(v) + + // Handle pointers by obtaining the element value. + if val.Kind() == reflect.Ptr { + val = val.Elem() + } + + typ := val.Type() + + // Iterate over each field of the struct. + for i := 0; i < typ.NumField(); i++ { + field := val.Field(i) + fieldType := typ.Field(i) + fieldName := fieldType.Name + + // If the field is a nested struct, recursively print its non-empty fields. + if field.Kind() == reflect.Struct { + PrintNonEmptyFields(prefix+fieldName+".", field.Interface()) + } else if field.Kind() == reflect.String && field.String() != "" { + // Use PrintColored to output the field name (with a colon) in high-intensity yellow, + // followed by the field value in default formatting. + PrintColored(prefix+fieldName+": ", field.String(), color.FgHiYellow) + } + } +} diff --git a/pkg/utils/printstruct_test.go b/pkg/utils/printstruct_test.go new file mode 100644 index 0000000..b1bd422 --- /dev/null +++ b/pkg/utils/printstruct_test.go @@ -0,0 +1,111 @@ +// File: pkg/utils/printstruct_test.go + +package utils + +import ( + "bytes" + "io" + "os" + "strings" + "testing" +) + +// captureOutput redirects os.Stdout during the execution of f() and returns the captured output. +// (Renamed from captureStdout to avoid conflict with the same helper in printcolor_test.go.) +func captureOutput(f func()) string { + oldStdout := os.Stdout + r, w, _ := os.Pipe() + os.Stdout = w + + f() + + w.Close() + os.Stdout = oldStdout + var buf bytes.Buffer + io.Copy(&buf, r) + return buf.String() +} + +// Define sample structs for testing PrintNonEmptyFields. + +// SimpleStruct contains a string field and a non-string field. Only the string field should be printed. +type SimpleStruct struct { + Name string + Age int +} + +// NestedStruct is used for nesting within another struct. 
+type NestedStruct struct { + Title string + Comment string +} + +// ComplexStruct demonstrates nested structures. Only non-empty string fields should be printed. +type ComplexStruct struct { + URL string + Nested NestedStruct + Empty string + Other int +} + +// TestPrintNonEmptyFields verifies that PrintNonEmptyFields correctly traverses structs +// (or pointers to structs) and prints non-empty string fields with appropriate prefixes. +func TestPrintNonEmptyFields(t *testing.T) { + tests := []struct { + name string + input interface{} + expected []string // Expected substrings to be found in the printed output. + }{ + { + name: "SimpleStruct with non-empty Name", + input: SimpleStruct{Name: "Alice", Age: 30}, + expected: []string{"Name:", "Alice"}, + }, + { + name: "SimpleStruct with empty Name", + input: SimpleStruct{Name: "", Age: 40}, + expected: []string{}, // No output expected because Name is empty. + }, + { + name: "ComplexStruct with nested non-empty fields", + input: ComplexStruct{ + URL: "http://example.com", + Nested: NestedStruct{ + Title: "Example Title", + Comment: "", + }, + Empty: "", + Other: 10, + }, + expected: []string{ + "URL:", "http://example.com", + "Nested.Title:", "Example Title", + }, + }, + { + name: "Pointer to SimpleStruct with non-empty Name", + input: &SimpleStruct{Name: "Bob", Age: 25}, + expected: []string{"Name:", "Bob"}, + }, + } + + // Iterate over each test case. + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.name, func(t *testing.T) { + output := captureOutput(func() { + PrintNonEmptyFields("", tc.input) + }) + // If no output is expected, verify that the captured output is empty. + if len(tc.expected) == 0 && strings.TrimSpace(output) != "" { + t.Errorf("Test case %q: expected no output, got %q", tc.name, output) + } + // Otherwise, verify that each expected substring is present in the output. 
+ for _, substr := range tc.expected { + if !strings.Contains(output, substr) { + t.Errorf("Test case %q: expected output to contain %q, but it did not. Full output: %q", tc.name, substr, output) + } + } + }) + } +} diff --git a/scrapeycli b/scrapeycli deleted file mode 100755 index c1cd98c..0000000 Binary files a/scrapeycli and /dev/null differ diff --git a/scripts/coverage_formatter.go b/scripts/coverage_formatter.go new file mode 100644 index 0000000..3ca019d --- /dev/null +++ b/scripts/coverage_formatter.go @@ -0,0 +1,130 @@ +package main + +import ( + "bufio" + "fmt" + "io" + "os" + "path/filepath" + "regexp" + "strconv" + "strings" + + "github.com/fatih/color" +) + +// detailedCoverageRegex matches typical coverage detail lines from `go tool cover -func`. +// Example: +// +// github.com/.../file.go:31: funcName 100.0% +var detailedCoverageRegex = regexp.MustCompile(`^([^:]+\.go):(\d+):(\s+)(\S+)(\s+)([0-9]+\.[0-9]+%)$`) + +// fallbackCoverageRegex matches coverage percentages in fallback lines (e.g. "total: (statements) 70.0%"). +var fallbackCoverageRegex = regexp.MustCompile(`([0-9]+\.[0-9]+%)`) + +// Coverage thresholds. +const ( + HighCoverageThreshold = 80.0 + MediumCoverageThreshold = 50.0 +) + +// Color styles. +var ( + dirStyle = color.New(color.FgWhite).Add(color.Bold) + fileStyle = color.New(color.FgCyan).Add(color.Bold) + lineNumStyle = color.New(color.FgMagenta).Add(color.Bold) + funcStyle = color.New(color.FgHiBlue) + colorHighCov = color.New(color.FgGreen) + colorMidCov = color.New(color.FgYellow) + colorLowCov = color.New(color.FgRed) +) + +// inputReader is our source for input; it defaults to os.Stdin but can be overridden in tests. +var inputReader io.Reader = os.Stdin + +// exitFunc is used to exit in main(). It defaults to os.Exit but can be overridden in tests. +var exitFunc = os.Exit + +// run reads from the provided reader and writes styled output to stdout. +// It returns an error if a read error occurs. 
+func run(in io.Reader) error { + scanner := bufio.NewScanner(in) + for scanner.Scan() { + originalLine := scanner.Text() + styledLine := styleCoverageLine(originalLine) + fmt.Println(styledLine) + } + if err := scanner.Err(); err != nil { + fmt.Fprintf(os.Stderr, "Error reading stdin: %v\n", err) + return err + } + return nil +} + +// main calls run(inputReader) and uses exitFunc if an error occurs. +func main() { + if err := run(inputReader); err != nil { + exitFunc(1) + } +} + +// styleCoverageLine returns a styled version of the given line. +// If the line matches detailedCoverageRegex, it processes it accordingly; +// otherwise, it falls back to colorizeCoverageInLine. +func styleCoverageLine(line string) string { + if matches := detailedCoverageRegex.FindStringSubmatch(line); matches != nil { + fullPath := matches[1] + lineNumber := matches[2] + spacingBeforeFunc := matches[3] + funcName := matches[4] + spacingBeforeCoverage := matches[5] + coverageString := matches[6] + coloredFilePath := formatPathAndFile(fullPath) + coloredLineNumber := lineNumStyle.Sprint(lineNumber) + coloredFunction := funcStyle.Sprint(funcName) + coloredCoverage := colorizeCoverage(coverageString) + return fmt.Sprintf("%s:%s:%s%s%s%s", + coloredFilePath, + coloredLineNumber, + spacingBeforeFunc, + coloredFunction, + spacingBeforeCoverage, + coloredCoverage, + ) + } + return colorizeCoverageInLine(line) +} + +// formatPathAndFile splits a file path into directory and file components and colors them. +func formatPathAndFile(fullPath string) string { + dir := filepath.Dir(fullPath) + file := filepath.Base(fullPath) + if dir == "." || dir == "" { + return fileStyle.Sprint(file) + } + return dirStyle.Sprintf("%s/", dir) + fileStyle.Sprint(file) +} + +// colorizeCoverageInLine replaces all coverage percentages in a line with their colored versions. 
+func colorizeCoverageInLine(line string) string { + return fallbackCoverageRegex.ReplaceAllStringFunc(line, func(match string) string { + return colorizeCoverage(match) + }) +} + +// colorizeCoverage returns a colored string for the given coverage percentage. +func colorizeCoverage(coverageStr string) string { + rawNumber := strings.TrimSuffix(coverageStr, "%") + coverageValue, parseErr := strconv.ParseFloat(rawNumber, 64) + if parseErr != nil { + return coverageStr + } + switch { + case coverageValue >= HighCoverageThreshold: + return colorHighCov.Sprint(coverageStr) + case coverageValue >= MediumCoverageThreshold: + return colorMidCov.Sprint(coverageStr) + default: + return colorLowCov.Sprint(coverageStr) + } +} diff --git a/scripts/coverage_formatter_test.go b/scripts/coverage_formatter_test.go new file mode 100644 index 0000000..3060a71 --- /dev/null +++ b/scripts/coverage_formatter_test.go @@ -0,0 +1,207 @@ +package main + +import ( + "bufio" + "bytes" + "fmt" + "io" + "os" + "path/filepath" + "strings" + "testing" +) + +// captureOutput captures stdout during the execution of f. +func captureOutput(f func()) string { + oldStdout := os.Stdout + r, w, _ := os.Pipe() + os.Stdout = w + + f() + + w.Close() + os.Stdout = oldStdout + var buf bytes.Buffer + io.Copy(&buf, r) + return buf.String() +} + +// alwaysErrorReader is a reader that always returns an error. +type alwaysErrorReader struct{} + +func (r alwaysErrorReader) Read(p []byte) (int, error) { + return 0, fmt.Errorf("simulated read error") +} + +// TestColorizeCoverage tests the colorizeCoverage function. 
+func TestColorizeCoverage(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + {"HighCoverage", "100.0%", colorHighCov.Sprint("100.0%")}, + {"MediumCoverage", "60.0%", colorMidCov.Sprint("60.0%")}, + {"LowCoverage", "40.0%", colorLowCov.Sprint("40.0%")}, + {"InvalidCoverage", "foo%", "foo%"}, + } + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + got := colorizeCoverage(tc.input) + if got != tc.expected { + t.Errorf("colorizeCoverage(%q) = %q; expected %q", tc.input, got, tc.expected) + } + }) + } +} + +// TestColorizeCoverageInLine tests colorizeCoverageInLine. +func TestColorizeCoverageInLine(t *testing.T) { + input := "total: (statements) 70.0%" + expected := fallbackCoverageRegex.ReplaceAllStringFunc(input, func(match string) string { + return colorizeCoverage(match) + }) + got := colorizeCoverageInLine(input) + if got != expected { + t.Errorf("colorizeCoverageInLine(%q) = %q; expected %q", input, got, expected) + } +} + +// TestFormatPathAndFile tests formatPathAndFile. +func TestFormatPathAndFile(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + {"WithDir", "github.com/foo/bar/file.go", + dirStyle.Sprintf("%s/", filepath.Dir("github.com/foo/bar/file.go")) + + fileStyle.Sprint(filepath.Base("github.com/foo/bar/file.go"))}, + {"NoDir", "file.go", fileStyle.Sprint("file.go")}, + } + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + got := formatPathAndFile(tc.input) + if got != tc.expected { + t.Errorf("formatPathAndFile(%q) = %q; expected %q", tc.input, got, tc.expected) + } + }) + } +} + +// TestStyleCoverageLineDetailed tests styleCoverageLine with a detailed line. 
+func TestStyleCoverageLineDetailed(t *testing.T) { + fullPath := "github.com/foo/bar/file.go" + lineNumber := "31" + spacingBeforeFunc := " " + funcName := "init" + spacingBeforeCoverage := " " + coverageStr := "100.0%" + input := fullPath + ":" + lineNumber + ":" + spacingBeforeFunc + funcName + spacingBeforeCoverage + coverageStr + expected := formatPathAndFile(fullPath) + ":" + + lineNumStyle.Sprint(lineNumber) + ":" + + spacingBeforeFunc + + funcStyle.Sprint(funcName) + + spacingBeforeCoverage + + colorizeCoverage(coverageStr) + got := styleCoverageLine(input) + if got != expected { + t.Errorf("styleCoverageLine(detailed) = %q; expected %q", got, expected) + } +} + +// TestStyleCoverageLineFallback tests styleCoverageLine with fallback input. +func TestStyleCoverageLineFallback(t *testing.T) { + input := "total: (statements) 70.0%" + expected := colorizeCoverageInLine(input) + got := styleCoverageLine(input) + if got != expected { + t.Errorf("styleCoverageLine(fallback) = %q; expected %q", got, expected) + } +} + +// TestRunError tests run() with an error using alwaysErrorReader. +func TestRunError(t *testing.T) { + err := run(alwaysErrorReader{}) + if err == nil { + t.Error("Expected run() to return error, got nil") + } +} + +// Save original values to restore after tests. +var ( + originalInputReader = inputReader + originalExitFunc = exitFunc +) + +// TestMainNoError tests main() when run() succeeds. +func TestMainNoError(t *testing.T) { + exitCalled := false + exitCode := 0 + exitFunc = func(code int) { + exitCalled = true + exitCode = code + } + defer func() { exitFunc = originalExitFunc }() + + inputReader = strings.NewReader("total: (statements) 70.0%\n") + defer func() { inputReader = originalInputReader }() + + main() + if exitCalled { + t.Errorf("Expected main() not to call exitFunc, but it was called with code %d", exitCode) + } +} + +// TestMainWithError tests main() when run() returns an error. 
+func TestMainWithError(t *testing.T) { + exitCalled := false + exitCode := 0 + exitFunc = func(code int) { + exitCalled = true + exitCode = code + } + defer func() { exitFunc = originalExitFunc }() + + inputReader = alwaysErrorReader{} + defer func() { inputReader = originalInputReader }() + + main() + if !exitCalled { + t.Error("Expected main() to call exitFunc due to error, but it was not called") + } + if exitCode != 1 { + t.Errorf("Expected exit code 1, got %d", exitCode) + } +} + +// TestMainIntegration tests main() by overriding inputReader and capturing output. +func TestMainIntegration(t *testing.T) { + inputLines := []string{ + "github.com/foo/bar/file.go:31: init 100.0%", + "total: (statements) 70.0%", + } + input := strings.Join(inputLines, "\n") + inputReader = strings.NewReader(input) + defer func() { inputReader = originalInputReader }() + + output := captureOutput(func() { main() }) + scanner := bufio.NewScanner(strings.NewReader(output)) + var outputLines []string + for scanner.Scan() { + outputLines = append(outputLines, scanner.Text()) + } + if len(outputLines) != len(inputLines) { + t.Fatalf("Expected %d output lines, got %d", len(inputLines), len(outputLines)) + } + detailedExpected := styleCoverageLine(inputLines[0]) + if outputLines[0] != detailedExpected { + t.Errorf("Main integration detailed line = %q; expected %q", outputLines[0], detailedExpected) + } + fallbackExpected := styleCoverageLine(inputLines[1]) + if outputLines[1] != fallbackExpected { + t.Errorf("Main integration fallback line = %q; expected %q", outputLines[1], fallbackExpected) + } +} diff --git a/test/README.md b/test/README.md new file mode 100644 index 0000000..8a167eb --- /dev/null +++ b/test/README.md @@ -0,0 +1,4 @@ +# Test Directory + +This directory is reserved for additional non-specific component tests. +Place your extra tests here to have them run during `make test`. 
diff --git a/test/fail_test.go b/test/fail_test.go new file mode 100644 index 0000000..aaa37db --- /dev/null +++ b/test/fail_test.go @@ -0,0 +1,11 @@ +// File: test/fail_test.go + +package main + +import ( + "testing" +) + +func TestAlwaysFail(t *testing.T) { + // t.Errorf("This test is designed to fail!") +}