Skip to content

Commit 2e966d7

Browse files
authored
Merge pull request #9 from grove-platform/add-filtering-to-create-url-list
Add filtering capability to `create-url-list` tool
2 parents 6e65165 + 0c756dd commit 2e966d7

3 files changed

Lines changed: 249 additions & 68 deletions

File tree

create-url-list/README.md

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,19 @@ go build
1111
## Usage
1212

1313
```bash
14-
./create-url-list [--quiet] <csv-file-path> [range] [output-path]
14+
./create-url-list [--quiet] [--contains <substring>] <csv-file-path> [range] [output-path]
1515
```
1616

1717
### Arguments
1818

1919
1. **--quiet** (optional): Suppress all informational output (warnings, info messages, and success messages). Only errors will be displayed. Useful when using this tool in pipelines.
20-
2. **csv-file-path** (required): Path to the input CSV file
21-
3. **range** (optional): Rank range in format `min-max` (e.g., `1-50`). Default: `1-250`
20+
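2. **--contains** (optional): Filter URLs to only include those containing the specified substring (case-sensitive match against the full URL). For example, `--contains /manual/` will only include URLs that contain `/manual/`. Ranks are assigned across all URLs before this filter is applied, so each matching URL keeps its original overall rank and the `range` argument selects by that original rank.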
21+
3. **csv-file-path** (required): Path to the input CSV file
22+
4. **range** (optional): Rank range in format `min-max` (e.g., `1-50`). Default: `1-250`
2223
- Specifies which ranked entries to include in the output
2324
- `1-50` means "get the top 50 pages by pageviews"
2425
- `51-100` means "get pages ranked 51-100 by pageviews"
25-
4. **output-path** (optional): Custom output file path. Default: `output/YYYY-MM-DD_HH-MM-SS_range.csv`
26+
5. **output-path** (optional): Custom output file path. Default: `output/YYYY-MM-DD_HH-MM-SS_range.csv`
2627

2728
### Examples
2829

@@ -39,8 +40,17 @@ go build
3940
# Specify custom output path
4041
./create-url-list data.csv 1-100 results/top-100.csv
4142

43+
# Filter for URLs containing "/manual/" (e.g., database manual documentation)
44+
./create-url-list --contains /manual/ data.csv
45+
46+
# Filter for URLs containing "/manual/" that rank within the overall top 50 (ranks are assigned before filtering, so fewer than 50 rows may be returned)
47+
./create-url-list --contains /manual/ data.csv 1-50
48+
4249
# Use in a pipeline with quiet mode (no informational output)
4350
./create-url-list --quiet data.csv 1-50 output.csv
51+
52+
# Combine multiple flags: quiet mode with URL filtering
53+
./create-url-list --quiet --contains /manual/ data.csv 1-50 output.csv
4454
```
4555

4656
## Input Requirements

create-url-list/main.go

Lines changed: 65 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
type Record struct {
1717
Page string
1818
MeasureValues int
19+
Rank int // Original rank before any filtering
1920
}
2021

2122
type Config struct {
@@ -34,19 +35,32 @@ func main() {
3435
func run() error {
3536
// Parse command-line arguments
3637
if len(os.Args) < 2 {
37-
return fmt.Errorf("usage: %s [--quiet] <csv-file-path> [range] [output-path]", os.Args[0])
38+
return fmt.Errorf("usage: %s [--quiet] [--contains <substring>] <csv-file-path> [range] [output-path]", os.Args[0])
3839
}
3940

40-
// Check for --quiet flag
41+
// Check for --quiet and --contains flags
4142
quiet := false
43+
containsFilter := ""
4244
args := os.Args[1:]
43-
if len(args) > 0 && args[0] == "--quiet" {
44-
quiet = true
45-
args = args[1:] // Remove --quiet from args
45+
46+
// Process flags
47+
for len(args) > 0 && strings.HasPrefix(args[0], "--") {
48+
if args[0] == "--quiet" {
49+
quiet = true
50+
args = args[1:] // Remove --quiet from args
51+
} else if args[0] == "--contains" {
52+
if len(args) < 2 {
53+
return fmt.Errorf("--contains flag requires a substring argument")
54+
}
55+
containsFilter = args[1]
56+
args = args[2:] // Remove --contains and its argument from args
57+
} else {
58+
return fmt.Errorf("unknown flag: %s", args[0])
59+
}
4660
}
4761

4862
if len(args) < 1 {
49-
return fmt.Errorf("usage: %s [--quiet] <csv-file-path> [range] [output-path]", os.Args[0])
63+
return fmt.Errorf("usage: %s [--quiet] [--contains <substring>] <csv-file-path> [range] [output-path]", os.Args[0])
5064
}
5165

5266
inputPath := args[0]
@@ -82,7 +96,7 @@ func run() error {
8296
}
8397

8498
// Read and process CSV
85-
records, err := processCSV(inputPath, config.IgnoreURLs, quiet)
99+
records, err := processCSV(inputPath, config.IgnoreURLs, containsFilter, quiet)
86100
if err != nil {
87101
return err
88102
}
@@ -145,7 +159,7 @@ func loadConfig(configPath string) (*Config, error) {
145159
return &config, nil
146160
}
147161

148-
func processCSV(inputPath string, ignoreURLs []string, quiet bool) ([]Record, error) {
162+
func processCSV(inputPath string, ignoreURLs []string, containsFilter string, quiet bool) ([]Record, error) {
149163
file, err := os.Open(inputPath)
150164
if err != nil {
151165
return nil, fmt.Errorf("failed to open file: %v", err)
@@ -184,8 +198,8 @@ func processCSV(inputPath string, ignoreURLs []string, quiet bool) ([]Record, er
184198
ignoreMap[url] = true
185199
}
186200

187-
// Read and collect all Pageviews records
188-
var records []Record
201+
// Read and collect all Pageviews records (before filtering by contains)
202+
var allRecords []Record
189203
var skippedURLs []string
190204
var ignoredURLs []string
191205
for {
@@ -223,12 +237,33 @@ func processCSV(inputPath string, ignoreURLs []string, quiet bool) ([]Record, er
223237
continue // Skip non-integer values
224238
}
225239

226-
records = append(records, Record{
240+
allRecords = append(allRecords, Record{
227241
Page: page,
228242
MeasureValues: measureValue,
229243
})
230244
}
231245

246+
// Sort all records by pageviews (highest to lowest) to establish true ranking
247+
sort.Slice(allRecords, func(i, j int) bool {
248+
return allRecords[i].MeasureValues > allRecords[j].MeasureValues
249+
})
250+
251+
// Assign ranks to all records
252+
for i := range allRecords {
253+
allRecords[i].Rank = i + 1
254+
}
255+
256+
// Now filter by contains substring if specified
257+
var records []Record
258+
var filteredURLs []string
259+
for _, record := range allRecords {
260+
if containsFilter != "" && !strings.Contains(record.Page, containsFilter) {
261+
filteredURLs = append(filteredURLs, record.Page)
262+
continue
263+
}
264+
records = append(records, record)
265+
}
266+
232267
// Report skipped URLs
233268
if !quiet && len(skippedURLs) > 0 {
234269
fmt.Fprintf(os.Stderr, "Warning: Skipped %d URL(s) that do not match expected structure (www.*):\n", len(skippedURLs))
@@ -245,33 +280,27 @@ func processCSV(inputPath string, ignoreURLs []string, quiet bool) ([]Record, er
245280
}
246281
}
247282

283+
// Report filtered URLs
284+
if !quiet && len(filteredURLs) > 0 {
285+
fmt.Fprintf(os.Stderr, "Info: Filtered out %d URL(s) not containing '%s':\n", len(filteredURLs), containsFilter)
286+
for _, url := range filteredURLs {
287+
fmt.Fprintf(os.Stderr, " - %s\n", url)
288+
}
289+
}
290+
248291
return records, nil
249292
}
250293

251294
func writeOutput(records []Record, outputPath, rangeStr string, minRank, maxRank int, showPageviews, showHeaders bool) (string, error) {
252-
// Sort by Measure Values (highest to lowest) to establish ranking
253-
sort.Slice(records, func(i, j int) bool {
254-
return records[i].MeasureValues > records[j].MeasureValues
255-
})
256-
257-
// Slice to get only the entries within the specified rank range
258-
// minRank and maxRank are 1-based, so we need to convert to 0-based indices
259-
startIdx := minRank - 1
260-
endIdx := maxRank
261-
262-
// Ensure we don't go out of bounds
263-
if startIdx < 0 {
264-
startIdx = 0
265-
}
266-
if endIdx > len(records) {
267-
endIdx = len(records)
268-
}
269-
if startIdx >= len(records) {
270-
// No records in this range
271-
records = []Record{}
272-
} else {
273-
records = records[startIdx:endIdx]
295+
// Records are already sorted and have ranks assigned
296+
// Filter to get only the entries within the specified rank range
297+
var filteredRecords []Record
298+
for _, record := range records {
299+
if record.Rank >= minRank && record.Rank <= maxRank {
300+
filteredRecords = append(filteredRecords, record)
301+
}
274302
}
303+
records = filteredRecords
275304

276305
// Determine output directory and filename
277306
var outputDir, filename string
@@ -317,18 +346,17 @@ func writeOutput(records []Record, outputPath, rangeStr string, minRank, maxRank
317346
}
318347

319348
// Write records with rank number, URL, and optionally pageviews
320-
for i, record := range records {
321-
rank := startIdx + i + 1 // Calculate the actual rank
349+
for _, record := range records {
322350
var row []string
323351
if showPageviews {
324352
row = []string{
325-
strconv.Itoa(rank),
353+
strconv.Itoa(record.Rank),
326354
record.Page,
327355
strconv.Itoa(record.MeasureValues),
328356
}
329357
} else {
330358
row = []string{
331-
strconv.Itoa(rank),
359+
strconv.Itoa(record.Rank),
332360
record.Page,
333361
}
334362
}

0 commit comments

Comments
 (0)