diff --git a/cmd/dotagents/config.go b/cmd/dotagents/config.go index 18dbf4a..b8518cc 100644 --- a/cmd/dotagents/config.go +++ b/cmd/dotagents/config.go @@ -93,6 +93,7 @@ func mergeConfig(base *config, overlay config) { base.MCPServers = mergeByKey(base.MCPServers, overlay.MCPServers, func(s mcpServerConfig) string { return strings.TrimSpace(s.Name) }) base.Hooks = mergeByKey(base.Hooks, overlay.Hooks, func(h hookConfig) string { return strings.TrimSpace(h.Name) }) base.Plugins = mergeByKey(base.Plugins, overlay.Plugins, func(p pluginConfig) string { return strings.TrimSpace(p.Name) }) + base.Sources = mergeByKey(base.Sources, overlay.Sources, func(s sourceConfig) string { return strings.TrimSpace(s.Name) }) } func mergeByKey[T any](base []T, overlay []T, key func(T) string) []T { diff --git a/cmd/dotagents/main.go b/cmd/dotagents/main.go index 36bcc28..cfb4205 100644 --- a/cmd/dotagents/main.go +++ b/cmd/dotagents/main.go @@ -15,6 +15,7 @@ type config struct { ExternalSkills []externalSkillSource `yaml:"external_skills"` Plugins []pluginConfig `yaml:"plugins,omitempty"` Hooks []hookConfig `yaml:"hooks,omitempty"` + Sources []sourceConfig `yaml:"sources,omitempty"` } type pluginConfig struct { @@ -171,6 +172,8 @@ func run(args []string) error { return err } return runDoctor(opts) + case "sources": + return runSources(args[1:]) case "external": return runExternal(args[1:]) case "promote": @@ -245,6 +248,7 @@ func printUsage() { fmt.Println(" dotagents mcp add --command Add/update canonical managed MCP") fmt.Println(" dotagents mcp import Import native MCP into canonical config") fmt.Println(" dotagents mcp remove Remove canonical managed MCP") + fmt.Println(" dotagents sources [--json|--compact] [name] Show external data source availability") fmt.Println(" dotagents external list Show external skill sources and lock state") fmt.Println(" dotagents external update [name ...] 
// sourceConfig is one entry of the `sources:` list in the dotagents config.
// It overrides the built-in registry defaults for the source with the same name.
type sourceConfig struct {
	// Name selects the registry entry this override applies to.
	Name string `yaml:"name"`
	// Enabled overrides the registry's DefaultOn when non-nil; nil keeps the default.
	Enabled *bool `yaml:"enabled,omitempty"`
	// Preferred names a method that wins over priority ordering whenever it is available.
	Preferred string `yaml:"preferred,omitempty"`
}

// methodType classifies how a source is reached; it decides which availability
// probe checkMethodAvailability applies.
type methodType string

const (
	methodCLI      methodType = "cli"      // local binary, optionally followed by a deeper check
	methodMCP      methodType = "mcp"      // MCP server that must be present and enabled in config
	methodAPI      methodType = "api"      // HTTP API; available unless a Check is set and fails
	methodFallback methodType = "fallback" // always reported as available
)

// sourceMethodDef describes one way of accessing a source.
type sourceMethodDef struct {
	Name     string
	Type     methodType
	Priority int    // lower value is preferred when picking the best available method
	Detect   string // binary name for LookPath
	Check    string // shell one-liner for deeper check (evaluated by runShellCheck)
	MCP      string // MCP server name to look up in config
	Auth     string // credential location (display)
	Setup    string // one-line setup instruction
}

// sourceDef is a built-in registry entry: a source and its access methods.
type sourceDef struct {
	Name      string
	Desc      string
	Methods   []sourceMethodDef
	DefaultOn bool   // enabled state used when the config carries no override
	ToSRisk   string // "", "high"
}

// methodStatus is the probe result for a single method, as emitted by --json.
type methodStatus struct {
	Name      string     `json:"name"`
	Type      methodType `json:"type"`
	Priority  int        `json:"priority"`
	Available bool       `json:"available"`
	// Reason explains why the method is unavailable; empty when available.
	Reason string `json:"reason,omitempty"`
}

// sourceStatus is the resolved state of one source, as emitted by --json.
type sourceStatus struct {
	Name    string `json:"name"`
	Enabled bool   `json:"enabled"`
	// Best is the selected method name; empty when disabled or nothing is available.
	Best string `json:"best"`
	// Methods holds one entry per probed method; it is left empty for disabled sources.
	Methods []methodStatus `json:"methods"`
}
consumer app at developer.x.com"}, + {Name: "websearch", Type: methodFallback, Priority: 99}, + }, + }, + { + Name: "reddit", Desc: "Reddit", DefaultOn: true, + Methods: []sourceMethodDef{ + {Name: "rdt-cli", Type: methodCLI, Priority: 1, Detect: "rdt", Setup: "uv tool install rdt-cli"}, + {Name: "pullpush", Type: methodAPI, Priority: 2}, + {Name: "websearch", Type: methodFallback, Priority: 99}, + }, + }, + { + Name: "hacker-news", Desc: "Hacker News", DefaultOn: true, + Methods: []sourceMethodDef{ + {Name: "algolia", Type: methodAPI, Priority: 1}, + }, + }, + { + Name: "discord", Desc: "Discord", DefaultOn: false, ToSRisk: "high", + Methods: []sourceMethodDef{ + {Name: "discord-cli", Type: methodCLI, Priority: 1, Detect: "discord", Check: "test -n \"$DISCORD_TOKEN\"", Auth: "$DISCORD_TOKEN env var", Setup: "uv tool install kabi-discord-cli"}, + {Name: "websearch", Type: methodFallback, Priority: 99}, + }, + }, + { + Name: "github", Desc: "GitHub", DefaultOn: true, + Methods: []sourceMethodDef{ + {Name: "gh", Type: methodCLI, Priority: 1, Detect: "gh", Setup: "gh auth login"}, + }, + }, + { + Name: "linkedin", Desc: "LinkedIn", DefaultOn: true, + Methods: []sourceMethodDef{ + {Name: "linkedin-mcp", Type: methodMCP, Priority: 1, MCP: "linkedin", Setup: "uvx linkedin-scraper-mcp --login"}, + {Name: "websearch", Type: methodFallback, Priority: 99}, + }, + }, + { + Name: "telegram", Desc: "Telegram", DefaultOn: true, + Methods: []sourceMethodDef{ + {Name: "tg", Type: methodCLI, Priority: 1, Detect: "tg", Auth: "~/.local/share/dotagents/telegram-readonly/telegram.session", Setup: "cd ~/.agents/mcp/telegram-readonly && uv run python login.py"}, + {Name: "telegram-mcp", Type: methodMCP, Priority: 2, MCP: "telegram-readonly"}, + }, + }, + { + Name: "google", Desc: "Google Workspace", DefaultOn: true, + Methods: []sourceMethodDef{ + {Name: "gws", Type: methodCLI, Priority: 1, Detect: "gws", Auth: "~/.config/gws/", Setup: "gws auth login"}, + }, + }, + { + Name: 
"web-search", Desc: "Web search (general)", DefaultOn: true, + Methods: []sourceMethodDef{ + {Name: "tavily-mcp", Type: methodMCP, Priority: 1, MCP: "tavily"}, + {Name: "native", Type: methodFallback, Priority: 99}, + }, + }, + { + Name: "job-portals", Desc: "ATS portals (Greenhouse, Ashby, Lever)", DefaultOn: true, + Methods: []sourceMethodDef{ + {Name: "portals-scan", Type: methodCLI, Priority: 1, Detect: "go", Check: "test -d ~/.agents/skills/jobs/tools/portals-scan"}, + }, + }, + { + Name: "glassdoor", Desc: "Glassdoor / Levels.fyi", DefaultOn: true, + Methods: []sourceMethodDef{ + {Name: "websearch", Type: methodFallback, Priority: 99}, + }, + }, + { + Name: "hugging-face", Desc: "Hugging Face Hub", DefaultOn: true, + Methods: []sourceMethodDef{ + {Name: "hf-api", Type: methodAPI, Priority: 1}, + }, + }, + { + Name: "ra", Desc: "Resident Advisor", DefaultOn: false, + Methods: nil, + }, +} + +func checkMethodAvailability(m sourceMethodDef, cfg config, home string) (bool, string) { + switch m.Type { + case methodFallback: + return true, "" + case methodAPI: + if m.Check == "" { + return true, "" + } + return runShellCheck(m.Check, home) + case methodCLI: + if m.Detect != "" { + if _, err := exec.LookPath(m.Detect); err != nil { + return false, fmt.Sprintf("%s not on PATH", m.Detect) + } + } + if m.Check != "" { + return runShellCheck(m.Check, home) + } + return true, "" + case methodMCP: + if m.MCP == "" { + return false, "no MCP server name" + } + for _, srv := range cfg.MCPServers { + if srv.Name == m.MCP && srv.Enabled { + return true, "" + } + } + return false, fmt.Sprintf("MCP server %q not configured or disabled", m.MCP) + } + return false, "unknown method type" +} + +func runShellCheck(cmd string, home string) (bool, string) { + cmd = strings.ReplaceAll(cmd, "~", home) + parts := strings.Fields(cmd) + if len(parts) >= 3 && parts[0] == "test" { + op := parts[1] + arg := strings.Trim(parts[2], "\"'") + switch op { + case "-s": + info, err := os.Stat(arg) + 
if err != nil { + return false, err.Error() + } + if info.Size() == 0 { + return false, "file is empty" + } + return true, "" + case "-d": + info, err := os.Stat(arg) + if err != nil { + return false, err.Error() + } + if !info.IsDir() { + return false, "not a directory" + } + return true, "" + case "-n": + if strings.HasPrefix(arg, "$") { + if os.Getenv(strings.TrimPrefix(arg, "$")) == "" { + return false, "environment variable not set" + } + return true, "" + } + } + } + out, err := exec.Command("sh", "-c", cmd).CombinedOutput() + if err != nil { + reason := strings.TrimSpace(string(out)) + if reason == "" { + reason = "check failed" + } + return false, reason + } + return true, "" +} + +func resolveSourceStatus(cfg config, home string) []sourceStatus { + overrides := make(map[string]sourceConfig, len(cfg.Sources)) + for _, s := range cfg.Sources { + overrides[s.Name] = s + } + + var results []sourceStatus + for _, def := range sourceRegistry { + enabled := def.DefaultOn + preferred := "" + if ov, ok := overrides[def.Name]; ok { + if ov.Enabled != nil { + enabled = *ov.Enabled + } + preferred = ov.Preferred + } + + ss := sourceStatus{ + Name: def.Name, + Enabled: enabled, + } + + if !enabled || len(def.Methods) == 0 { + results = append(results, ss) + continue + } + + bestPriority := 999 + for _, m := range def.Methods { + avail, reason := checkMethodAvailability(m, cfg, home) + ms := methodStatus{ + Name: m.Name, + Type: m.Type, + Priority: m.Priority, + Available: avail, + Reason: reason, + } + ss.Methods = append(ss.Methods, ms) + + if avail && m.Name == preferred { + ss.Best = m.Name + bestPriority = -1 + } else if avail && m.Priority < bestPriority { + ss.Best = m.Name + bestPriority = m.Priority + } + } + + results = append(results, ss) + } + return results +} + +func findSourceDef(name string) *sourceDef { + for i := range sourceRegistry { + if sourceRegistry[i].Name == name { + return &sourceRegistry[i] + } + } + return nil +} + +func expandAuthPath(auth 
string, home string) string { + if strings.HasPrefix(auth, "~/") { + return filepath.Join(home, auth[2:]) + } + return auth +} diff --git a/cmd/dotagents/sources_cli.go b/cmd/dotagents/sources_cli.go new file mode 100644 index 0000000..630806b --- /dev/null +++ b/cmd/dotagents/sources_cli.go @@ -0,0 +1,174 @@ +package main + +import ( + "encoding/json" + "flag" + "fmt" + "os" + "strings" +) + +func runSources(args []string) error { + fs := flag.NewFlagSet("sources", flag.ContinueOnError) + jsonFlag := fs.Bool("json", false, "output JSON") + compactFlag := fs.Bool("compact", false, "one-line output for skill consumption") + configPath := fs.String("config", "", "config file path") + if err := fs.Parse(args); err != nil { + return err + } + + home, err := os.UserHomeDir() + if err != nil { + return fmt.Errorf("resolve home: %w", err) + } + + repoRoot, _, err := findRoots() + if err != nil { + return fmt.Errorf("find roots: %w", err) + } + + cfg, err := loadConfig(repoRoot, home, *configPath) + if err != nil { + return fmt.Errorf("load config: %w", err) + } + + statuses := resolveSourceStatus(cfg, home) + + // Single source detail mode + if fs.NArg() > 0 { + name := fs.Arg(0) + for _, ss := range statuses { + if ss.Name == name { + if *jsonFlag { + return printJSON(ss) + } + return printSourceDetail(ss, home) + } + } + return fmt.Errorf("unknown source %q", name) + } + + if *jsonFlag { + return printJSON(statuses) + } + if *compactFlag { + return printCompact(statuses) + } + return printTable(statuses) +} + +func printJSON(v interface{}) error { + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + return enc.Encode(v) +} + +func printCompact(statuses []sourceStatus) error { + var parts []string + for _, ss := range statuses { + if !ss.Enabled || ss.Best == "" { + continue + } + parts = append(parts, ss.Name+":"+ss.Best) + } + fmt.Println(strings.Join(parts, " ")) + return nil +} + +func printTable(statuses []sourceStatus) error { + for _, ss := range 
statuses { + name := padRight(ss.Name, 16) + if !ss.Enabled { + fmt.Printf("%s disabled\n", name) + continue + } + if len(ss.Methods) == 0 { + fmt.Printf("%s not integrated\n", name) + continue + } + + best := padRight(ss.Best, 16) + + var methodParts []string + var unavailable []string + for _, ms := range ss.Methods { + if ms.Available { + tag := "ok" + if ms.Type == methodFallback { + tag = "fallback" + } + methodParts = append(methodParts, fmt.Sprintf("%s [%s]", ms.Name, tag)) + } else { + reason := ms.Reason + if reason == "" { + reason = "unavailable" + } + unavailable = append(unavailable, fmt.Sprintf("%s: %s", ms.Name, reason)) + } + } + + methods := strings.Join(methodParts, " ") + line := fmt.Sprintf("%s %s %s", name, best, methods) + if len(unavailable) > 0 { + line += " (" + strings.Join(unavailable, "; ") + ")" + } + fmt.Println(line) + } + return nil +} + +func printSourceDetail(ss sourceStatus, home string) error { + def := findSourceDef(ss.Name) + if def == nil { + return fmt.Errorf("no definition for %q", ss.Name) + } + + fmt.Printf("%s (%s)\n", ss.Name, def.Desc) + fmt.Printf(" enabled: %v\n", ss.Enabled) + if def.ToSRisk != "" { + fmt.Printf(" tos-risk: %s\n", def.ToSRisk) + } + if ss.Best != "" { + fmt.Printf(" best: %s\n", ss.Best) + } + fmt.Println() + + if len(def.Methods) == 0 { + fmt.Println(" no methods (not integrated)") + return nil + } + + fmt.Println(" methods:") + for i, m := range def.Methods { + status := "NOT AVAILABLE" + reason := "" + if i < len(ss.Methods) { + if ss.Methods[i].Available { + status = "OK" + } else { + reason = ss.Methods[i].Reason + } + } + + marker := " " + if m.Name == ss.Best { + marker = "*" + } + fmt.Printf(" %s [%d] %-16s %-14s %s\n", marker, m.Priority, m.Name, status, reason) + + if m.Auth != "" { + fmt.Printf(" auth: %s\n", expandAuthPath(m.Auth, home)) + } + if m.Setup != "" { + fmt.Printf(" setup: %s\n", m.Setup) + } + } + return nil +} + +func padRight(s string, width int) string { + if len(s) >= 
width { + return s + } + return s + strings.Repeat(" ", width-len(s)) +} diff --git a/cmd/dotagents/sources_test.go b/cmd/dotagents/sources_test.go new file mode 100644 index 0000000..954e1e9 --- /dev/null +++ b/cmd/dotagents/sources_test.go @@ -0,0 +1,195 @@ +package main + +import ( + "testing" +) + +func TestResolveSourceStatus_defaults(t *testing.T) { + cfg := config{Version: 1} + statuses := resolveSourceStatus(cfg, "/tmp/fakehome") + + if len(statuses) != len(sourceRegistry) { + t.Fatalf("expected %d sources, got %d", len(sourceRegistry), len(statuses)) + } + + byName := make(map[string]sourceStatus) + for _, ss := range statuses { + byName[ss.Name] = ss + } + + // discord defaults to disabled + if byName["discord"].Enabled { + t.Error("discord should be disabled by default") + } + + // github defaults to enabled + if !byName["github"].Enabled { + t.Error("github should be enabled by default") + } + + // ra defaults to disabled + if byName["ra"].Enabled { + t.Error("ra should be disabled by default") + } + + // hacker-news algolia is always available (public API, no check) + hn := byName["hacker-news"] + if hn.Best != "algolia" { + t.Errorf("hacker-news best should be algolia, got %q", hn.Best) + } +} + +func TestResolveSourceStatus_enableOverride(t *testing.T) { + enabled := true + cfg := config{ + Version: 1, + Sources: []sourceConfig{ + {Name: "discord", Enabled: &enabled}, + }, + } + statuses := resolveSourceStatus(cfg, "/tmp/fakehome") + + for _, ss := range statuses { + if ss.Name == "discord" { + if !ss.Enabled { + t.Error("discord should be enabled via override") + } + return + } + } + t.Error("discord not found in statuses") +} + +func TestResolveSourceStatus_disableOverride(t *testing.T) { + disabled := false + cfg := config{ + Version: 1, + Sources: []sourceConfig{ + {Name: "github", Enabled: &disabled}, + }, + } + statuses := resolveSourceStatus(cfg, "/tmp/fakehome") + + for _, ss := range statuses { + if ss.Name == "github" { + if ss.Enabled { + 
t.Error("github should be disabled via override") + } + if ss.Best != "" { + t.Errorf("disabled source should have empty best, got %q", ss.Best) + } + return + } + } + t.Error("github not found in statuses") +} + +func TestResolveSourceStatus_mcpCheck(t *testing.T) { + cfg := config{ + Version: 1, + MCPServers: []mcpServerConfig{ + {Name: "tavily", Enabled: true, Command: "npx"}, + }, + } + statuses := resolveSourceStatus(cfg, "/tmp/fakehome") + + for _, ss := range statuses { + if ss.Name == "web-search" { + if ss.Best != "tavily-mcp" { + t.Errorf("web-search best should be tavily-mcp, got %q", ss.Best) + } + return + } + } + t.Error("web-search not found") +} + +func TestResolveSourceStatus_mcpMissing(t *testing.T) { + cfg := config{Version: 1} + statuses := resolveSourceStatus(cfg, "/tmp/fakehome") + + for _, ss := range statuses { + if ss.Name == "linkedin" { + // linkedin-mcp should be unavailable without MCP config + for _, ms := range ss.Methods { + if ms.Name == "linkedin-mcp" && ms.Available { + t.Error("linkedin-mcp should be unavailable without MCP server config") + } + } + // best should fall back to websearch + if ss.Best != "websearch" { + t.Errorf("linkedin best should be websearch without MCP, got %q", ss.Best) + } + return + } + } + t.Error("linkedin not found") +} + +func TestResolveSourceStatus_preferredOverride(t *testing.T) { + cfg := config{ + Version: 1, + Sources: []sourceConfig{ + {Name: "reddit", Preferred: "pullpush"}, + }, + } + statuses := resolveSourceStatus(cfg, "/tmp/fakehome") + + for _, ss := range statuses { + if ss.Name == "reddit" { + if ss.Best != "pullpush" { + t.Errorf("reddit best should be pullpush via preferred override, got %q", ss.Best) + } + return + } + } + t.Error("reddit not found") +} + +func TestFindSourceDef(t *testing.T) { + def := findSourceDef("x.com") + if def == nil { + t.Fatal("x.com not found in registry") + } + if def.Desc != "X.com / Twitter" { + t.Errorf("unexpected desc: %s", def.Desc) + } + + if 
findSourceDef("nonexistent") != nil { + t.Error("nonexistent source should return nil") + } +} + +func TestSourceConfigMerge(t *testing.T) { + enabled := true + base := config{ + Version: 1, + Agents: []agentConfig{{Name: "test", Enabled: true, SkillRoot: "/tmp"}}, + Sources: []sourceConfig{ + {Name: "discord"}, + {Name: "github"}, + }, + } + overlay := config{ + Sources: []sourceConfig{ + {Name: "discord", Enabled: &enabled, Preferred: "discord-cli"}, + }, + } + mergeConfig(&base, overlay) + + if len(base.Sources) != 2 { + t.Fatalf("expected 2 sources after merge, got %d", len(base.Sources)) + } + for _, s := range base.Sources { + if s.Name == "discord" { + if s.Enabled == nil || !*s.Enabled { + t.Error("discord enabled should be true after merge") + } + if s.Preferred != "discord-cli" { + t.Errorf("discord preferred should be discord-cli, got %q", s.Preferred) + } + return + } + } + t.Error("discord not found after merge") +} diff --git a/docs/sources.md b/docs/sources.md new file mode 100644 index 0000000..42d69ea --- /dev/null +++ b/docs/sources.md @@ -0,0 +1,245 @@ +# External Data Sources Registry + +Centralized reference for every external data source accessible to dotagents skills, MCPs, and CLIs. +Each source entry covers: what it is, how we access it, auth requirements, which skills use it, known limitations, and gaps. + +## Runtime status + +```bash +dotagents sources # table: what's available now +dotagents sources --compact # one-liner for skill prompts +dotagents sources --json # structured output +dotagents sources x.com # single source detail with auth/setup info +``` + +Configure in `dotagents.yaml` under `sources:`. Per-machine overrides in `dotagents.local.yaml`. + +## Design goals + +1. One place to check "can I get data from X?" before writing a new skill or wiring a new MCP. +2. Each source has a canonical access method with a priority. `dotagents sources` picks the best available. +3. 
Auth credentials live in known locations; never duplicated across tools or pasted into chat. +4. Read-only by default. Write/mutate actions require explicit opt-in per source. +5. Configurable per machine: set `preferred: x-api-v2` in `dotagents.local.yaml` to override method selection. + +## Access method taxonomy + +| Method | Examples | Tradeoffs | +|---|---|---| +| **CLI tool** | `gh`, `x-cli`, `rdt-cli`, `tg`, `gws` | Best for agents; scriptable; output parseable. Preferred when available. | +| **MCP server** | linkedin-scraper-mcp, telegram-readonly, tavily | Native tool-use in agent context; higher setup cost; session lifecycle to manage. | +| **Public API** | HN Algolia, Greenhouse boards, HF Hub | No auth, no breakage risk from session rotation. Limited to public data. | +| **Web search** | Tavily MCP, native WebSearch, WebFetch | Fallback for anything; low precision; can't access authenticated content. | +| **Browser automation** | Playwright (LinkedIn MCP), x-cli login | Fragile; session/cookie rotation; ToS risk. Last resort. | + +--- + +## Source catalog + +### X.com / Twitter + +| | | +|---|---| +| Canonical access | `x-cli` CLI (Gladium-AI/x-cli) | +| Protocol | X internal GraphQL API via captured browser session | +| Auth | Browser cookies + bearer/CSRF in `~/.x-cli/credentials.json`. Login: `x-cli login` (requires Chrome). | +| Skills | x-cli, x-sim, tech-search, repo-eval | +| Read | timelines, tweets, users, search, followers/following | +| Write | None wired. x-sim is offline simulation only. | +| Fallback | `site:x.com ` via WebSearch | +| Limitations | GraphQL query ID drift breaks all commands until manual refresh. Rate-limited. No official API v2 wired (API key exists in research/API_KEYS.md but returns Unauthorized -- likely needs OAuth consumer flow). | +| Gaps | Official API v2 integration would eliminate endpoint drift risk. 
| + +### Hacker News + +| | | +|---|---| +| Canonical access | Algolia HN Search API (public, no auth) | +| Protocol | `GET https://hn.algolia.com/api/v1/search?query=&tags=story&hitsPerPage=N` | +| Auth | None | +| Skills | tech-search, repo-eval | +| Read | Story search by relevance. Thread bodies via WebFetch on `news.ycombinator.com/item?id=`. | +| Write | N/A | +| Fallback | `site:news.ycombinator.com` via WebSearch for older threads | +| Limitations | `search_by_date` endpoint returns zero-comment noise; disallowed in tech-search. | +| Gaps | None significant. Stable public API. | + +### Reddit + +| | | +|---|---| +| Canonical access | `rdt-cli` (Python, `uv tool install rdt-cli`) | +| Protocol | Reddit browser cookies; `rdt search`, `rdt read` | +| Auth | Browser cookies. VPS/headless = 403 without cookies. | +| Skills | tech-search | +| Read | Subreddit search, thread reading. Target subs: r/ExperiencedDevs, r/ClaudeAI, r/ClaudeCode, r/LocalLLaMA, r/MachineLearning, r/devops, r/commandline, r/neovim, r/Python, r/mcp, r/cybersecurity | +| Write | None | +| Fallback 1 | Pullpush API: `https://api.pullpush.io/reddit/search/submission/?q=&size=5&sort=desc&sort_type=score` (historical only, no auth) | +| Fallback 2 | `site:reddit.com ` via WebSearch | +| Limitations | Cookie-dependent; headless forbidden. Pullpush is historical, not real-time. | +| Gaps | No official Reddit API integration (requires app registration + OAuth2). Would solve headless access. | + +### Discord + +| | | +|---|---| +| Canonical access | `discord-cli` (Python, `uv tool install kabi-discord-cli`) | +| Protocol | Discord user-token API. Local SQLite sync for channels. | +| Auth | `DISCORD_TOKEN` env var (user token, not bot). CLI can scan local browser/Discord storage. | +| Skills | tech-search (opt-in only, ML/LLM/agents/evals topics) | +| Read | Channel search, message reading. Known guilds: NousResearch (`1053877538025386074`), Anthropic/Claude (`1456350064065904867`). 
| +| Write | None | +| Fallback | `rtk proxy curl` direct Discord API (same token) | +| Limitations | User-token usage may violate Discord ToS. Account risk noted. Opt-in only. Local search requires prior sync. | +| Gaps | Bot-token access would be ToS-safe but requires server admin approval. | + +### GitHub + +| | | +|---|---| +| Canonical access | `gh` CLI + `gh api graphql` | +| Protocol | GitHub REST + GraphQL via OAuth token | +| Auth | `gh auth login` (OAuth browser flow) | +| Skills | tech-search, repo-eval, pr-triage, jobs (career page detection) | +| Read | Repos, issues, PRs, releases, commits, check runs, languages, contributor stats. Clone to `~/Public/` for code analysis. | +| Write | PR comments via pr-triage (draft-then-review by default; direct replies to bot comments allowed). | +| Limitations | None significant. Always-available. | +| Gaps | None. | + +### LinkedIn + +| | | +|---|---| +| Canonical access | `linkedin-scraper-mcp` MCP server (Playwright browser automation) | +| Protocol | Headless Chrome via Playwright. No official LinkedIn API. | +| Auth | One-time browser login: `uvx linkedin-scraper-mcp --login`. Session managed by Playwright. | +| Skills | jobs | +| MCP tools | get_inbox, search_conversations, get_conversation, get_person_profile, get_company_profile, get_company_employees, get_job_details, search_people, search_jobs, search_companies | +| Read | DMs, profiles, job postings, company data | +| Write | connect_with_person, send_message exist but are NOT used autonomously. Jobs skill: "never send messages or connection requests unless user explicitly asks." | +| Config | dotagents.yaml, ~/.codex/config.toml, ~/.hermes/config.yaml (all enabled) | +| Limitations | Browser automation fragile. Session expires. MCP unavailability handled gracefully by jobs skill. | +| Gaps | No CLI wrapper. Official LinkedIn API requires company-level partnership. 
| + +### Telegram + +| | | +|---|---| +| Canonical access | `tg` CLI + `telegram-readonly` MCP server (both backed by same Telethon daemon) | +| Protocol | MTProto via Telethon. Singleton daemon on Unix socket `~/.local/share/dotagents/telegram-readonly/daemon.sock`. | +| Auth | `TELEGRAM_API_ID` + `TELEGRAM_API_HASH` in `~/.agents/mcp/telegram-readonly/.env`. Session file at `~/.local/share/dotagents/telegram-readonly/telegram.session`. Login: `uv run python login.py`. | +| Skills | tg | +| MCP tools | list_dialogs, get_recent_messages, search_messages, get_chat_info | +| CLI commands | `tg dialogs`, `tg read`, `tg search`, `tg info` | +| Read | Chat listing, message reading, search across chats | +| Write | None. Read-only enforced in both MCP and CLI. | +| Config | ~/.codex/config.toml, ~/.hermes/config.yaml | +| Limitations | Idle timeout 30m (daemon sleeps). Session can expire requiring re-login. | +| Gaps | No write capability by design. Hermes has `TELEGRAM_HOME_CHANNEL: 38369051` for its own gateway but that's separate from dotagents. | + +### Google Workspace (Gmail, Drive, Docs, Sheets, Calendar) + +| | | +|---|---| +| Canonical access | `gws` CLI | +| Protocol | Google REST APIs via OAuth | +| Auth | `gws auth login`. Credentials at `~/.config/gws/`. Also `~/.hermes/auth/google_oauth.json` for Hermes. | +| Skills | gws, jobs (Gmail recruiter signal queries) | +| Read | Full CRUD on Drive, Docs, Sheets; Gmail search/read; Calendar events | +| Write | Docs, Sheets, Drive file ops. Gmail: "never send mail unless explicitly asked." | +| Limitations | None significant. | +| Gaps | Claude Code MCP for Google Workspace exists (claude.ai Gmail/Drive/Calendar) but is separate from gws CLI. Parity not enforced. | + +### Web Search (general) + +| | | +|---|---| +| Canonical access | Tavily MCP server (primary) + native agent WebSearch (fallback) | +| Protocol | Tavily: `npx mcp-remote https://mcp.tavily.com/mcp`. Native: agent built-in. 
| +| Auth | Tavily: OAuth via mcp-remote (server-side, no local API key). Native: none. | +| Skills | tech-search, repo-eval, jobs, and general fallback for any source | +| MCP tools | tavily_search, tavily_extract, tavily_crawl, tavily_map, tavily_research | +| Read | Web search, page extraction, site mapping | +| Limitations | Tavily auth mechanism opaque (mcp-remote handles it). WebFetch blocked by many sites (RA, LinkedIn, etc.). | +| Gaps | Tavily auth needs verification if it stops working. | + +### Job ATS Portals (Greenhouse, Ashby, Lever) + +| | | +|---|---| +| Canonical access | `portals-scan` Go tool (skills/jobs/tools/portals-scan/) | +| Protocol | Direct HTTP to public board APIs. No auth. | +| Endpoints | Greenhouse: `boards-api.greenhouse.io/v1/boards/{slug}/jobs`; Ashby: `api.ashbyhq.com/posting-api/job-board/{slug}`; Lever: `api.lever.co/v0/postings/{slug}` | +| Skills | jobs (/jobs scan) | +| Read | Job listings with title, location, team, compensation (where available) | +| Write | None | +| Limitations | Only 3 ATS platforms. Companies with custom career portals need `enabled: false`. 10 concurrent scans. | +| Gaps | No Workday, iCIMS, SmartRecruiters, BambooHR. These cover a large chunk of enterprise hiring. | + +### Glassdoor / Levels.fyi + +| | | +|---|---| +| Canonical access | WebSearch only (no structured integration) | +| Skills | jobs (/jobs check -- comp research) | +| Read | Whatever WebSearch returns for `"company" "role" salary levels.fyi glassdoor` | +| Limitations | No structured scraping. "If no data, say so -- do not invent numbers." | +| Gaps | levels.fyi has an unofficial API. Glassdoor blocks scraping aggressively. Low priority unless comp research becomes frequent. 
| + +### Hugging Face + +| | | +|---|---| +| Canonical access | HF Hub REST API (public) | +| Protocol | `GET https://huggingface.co/api/models/{owner}/{model}` | +| Auth | None for public models | +| Skills | repo-eval (ML/model repos only) | +| Read | Model/dataset metadata: public/private state, downloads, likes, tags, files | +| Limitations | Read-only. Pickle formats flagged as trusted-code artifacts, never loaded. | +| Gaps | `huggingface-cli` exists but not wired. Would give authenticated access to gated models. | + +### Resident Advisor (RA) + +| | | +|---|---| +| Canonical access | **Not integrated.** Undocumented GraphQL endpoint, reverse-engineered. | +| Protocol | POST `https://ra.co/graphql` with operation `GET_EVENT_LISTINGS`. Area codes: Amsterdam=29. | +| Auth | None (public, but blocks standard user-agents and WebFetch/Tavily). Requires browser-like UA + Referer header. | +| Read | Event listings with title, venue, artists, genres, attendance, times | +| Limitations | No official API. GraphQL schema undocumented, reverse-engineered. May break. RA blocks all fetch tools (403). | +| Gaps | Needs a CLI wrapper or skill integration if event lookups become recurring.
| + +--- + +## Not yet integrated (known wants) + +| Source | Why | Effort | Priority | +|---|---|---|---| +| **RA** | Event discovery in NL/EU cities | Low: wrap the known GraphQL endpoint in a CLI or skill step | Nice-to-have | +| **Glassdoor (structured)** | Comp research for job search | High: aggressive anti-scraping; unofficial APIs break | Low | +| **Workday/iCIMS ATS** | Enterprise career portals | Medium: each platform is different, no unified API | Low unless targeting large employers | +| **Reddit official API** | Headless-safe access without cookies | Medium: app registration + OAuth2 flow | Medium (fixes VPS access) | +| **X official API v2** | Eliminate endpoint drift from x-cli | Medium: need OAuth consumer flow, not just bearer token | Medium (stability) | +| **HF CLI** | Gated model access | Low: `pip install -U "huggingface_hub[cli]"`, wire into repo-eval | Low | + +## Auth credential locations + +| Source | Location | Rotation | +|---|---|---| +| X.com | `~/.x-cli/credentials.json` | Manual re-login on session expiry or GraphQL drift | +| Reddit | Browser cookies (managed by rdt-cli) | Manual; expires unpredictably | +| Discord | `DISCORD_TOKEN` env var | Manual; user-token, high risk | +| GitHub | `gh auth` keychain | Long-lived OAuth; rarely expires | +| LinkedIn | Playwright session (linkedin-scraper-mcp) | `uvx linkedin-scraper-mcp --login` on expiry | +| Telegram | `~/.local/share/dotagents/telegram-readonly/telegram.session` | Re-login via `uv run python login.py` | +| Google | `~/.config/gws/` + `~/.hermes/auth/google_oauth.json` | OAuth refresh; rarely expires | +| Tavily | mcp-remote managed (server-side) | Unknown; needs verification | + +## Principles + +1. **CLI-first.** If a CLI exists, prefer it over MCP. MCPs are for agent-native tool-use where CLI piping is awkward. +2. **Read-only default.** Write actions (sending messages, posting comments, connecting) require explicit user opt-in per invocation. +3.
**Graceful degradation.** Every source with auth has a documented fallback (usually WebSearch). Skills must not block on MCP unavailability. +4. **No secrets in chat.** Credentials stay in their canonical locations. Agents never paste tokens, cookies, or API keys into conversation logs. +5. **ToS awareness.** Sources using unofficial access (x-cli, discord user-token, LinkedIn Playwright) are labeled with risk. Opt-in where risk is high. +6. **Agent-agnostic.** Source access methods work across Claude Code, Codex, Hermes, Droid. Agent-specific wiring details go in dotagents.yaml, not here. diff --git a/dotagents.yaml b/dotagents.yaml index 83a518c..c4a391d 100644 --- a/dotagents.yaml +++ b/dotagents.yaml @@ -211,6 +211,33 @@ mcp_servers: - hermes - droid - pi +sources: + - name: x.com + enabled: true + - name: reddit + enabled: true + - name: hacker-news + enabled: true + - name: discord + enabled: false + - name: github + enabled: true + - name: linkedin + enabled: true + - name: telegram + enabled: true + - name: google + enabled: true + - name: web-search + enabled: true + - name: job-portals + enabled: true + - name: glassdoor + enabled: true + - name: hugging-face + enabled: true + - name: ra + enabled: false hooks: - name: memory-session-start enabled: true diff --git a/plugins/dotagents/skills/tech-search/SKILL.md b/plugins/dotagents/skills/tech-search/SKILL.md index 4428cde..6024bd6 100644 --- a/plugins/dotagents/skills/tech-search/SKILL.md +++ b/plugins/dotagents/skills/tech-search/SKILL.md @@ -31,7 +31,9 @@ The guiding rule: broad web finds the map; source-specific searches verify the t ## Sources -Search sources in parallel when practical, but do not force every source. Skipped sources are fine when they are irrelevant, unauthenticated, or low-signal. +Before searching, run `dotagents sources --compact` to discover available methods. Use the best available method for each source; fall back to the next if it fails at runtime. 
+ +Search sources in parallel when practical, but do not force every source. Skipped sources are fine when they are irrelevant, unauthenticated, or low-signal. If a source shows as disabled in `dotagents sources`, do not attempt it. Reference: `references/reddit-discord-cli-eval.md` records the repo evaluation behind the `rdt-cli` and `discord-cli` recommendations. @@ -86,38 +88,44 @@ Read threads at `https://news.ycombinator.com/item?id=` and cite the H ### 4. Reddit -Preferred path when installed: +Use the method shown by `dotagents sources reddit`. + +**Target subreddits:** r/ExperiencedDevs, r/ClaudeAI, r/ClaudeCode, r/LocalLLaMA, r/MachineLearning, r/devops, r/commandline, r/neovim, r/Python, r/mcp, r/cybersecurity +Pick 2-3 relevant subreddits. Avoid broad Reddit for ambiguous terms; it will chase engagement from irrelevant communities. Add context keywords, e.g. `uv poetry Python packaging`, not `poetry`. + +**rdt-cli** (when best method is rdt-cli): ```bash rdt search "<query>" -s relevance -t month -n 10 --compact --json rdt search "<query>" -r <subreddit> -s top -t year -n 10 --compact --json rdt read <post_id> -n 20 --json ``` -**Target subreddits:** r/ExperiencedDevs, r/ClaudeAI, r/ClaudeCode, r/LocalLLaMA, r/MachineLearning, r/devops, r/commandline, r/neovim, r/Python, r/mcp, r/cybersecurity +On VPS/headless hosts, `rdt search` may return Reddit `forbidden` without browser cookies; do not copy cookie secrets into chat. -Pick 2-3 relevant subreddits. Avoid broad Reddit for ambiguous terms; it will chase engagement from irrelevant communities. Add context keywords, e.g. `uv poetry Python packaging`, not `poetry`. +**pullpush** (fallback for historical posts or VPS): `https://api.pullpush.io/reddit/search/submission/?q=<query>&size=5&sort=desc&sort_type=score` -Raw Reddit `.json` endpoints often 403 and should be treated as a fallback only. On VPS/headless hosts, `rdt search` may return Reddit `forbidden` without browser cookies; do not copy cookie secrets into chat. 
For historical posts or VPS fallback, use Pullpush API (`https://api.pullpush.io/reddit/search/submission/?q=<query>&size=5&sort=desc&sort_type=score`). +**websearch** (fallback): `site:reddit.com <query>` via WebSearch. ### 5. X.com -Use `x-cli` for X.com searches. Check auth with `x-cli auth status`. +Use the method shown by `dotagents sources x.com`. Treat X as commentary unless the author is primary to the topic. **Power users:** @karpathy, @fchollet, @hardmaru, @thorstenball, @thdxr, @steipete, @banteg +**x-cli** (when best method is x-cli): ```bash x-cli search "(from:karpathy OR from:fchollet) <query>" --type latest --count 10 --json x-cli search "<query> (recommended OR \"game changer\")" --type top --count 10 --json ``` -Fallback: `site:x.com <query>` via WebSearch if x-cli auth is broken. Treat X as commentary unless the author is primary to the topic. +**websearch** (fallback): `site:x.com <query>` via WebSearch. ### 6. Discord -Discord remains opt-in because it uses user-token auth and may carry account-risk. Search Discord only when the topic is relevant to known communities (ML, LLMs, Claude, agents, evals, fine-tuning, etc). Skip for generic/unrelated topics. +Discord is disabled by default (ToS risk). If `dotagents sources` shows it disabled, skip entirely. When enabled, search only for topics relevant to known communities (ML, LLMs, Claude, agents, evals, fine-tuning). Skip for generic/unrelated topics. -**Preferred CLI for repeated/community monitoring**: `discord-cli` (`uv tool install kabi-discord-cli`) can sync accessible Discord channels into local SQLite, then search/export them with structured YAML/JSON. Use it only for accounts the user controls. Do not ask the user to paste raw Discord tokens into chat logs. +**discord-cli** (when enabled and available): can sync accessible Discord channels into local SQLite, then search/export with structured YAML/JSON. Do not ask the user to paste raw Discord tokens into chat logs. 
```bash discord status --yaml diff --git a/skills/tech-search/SKILL.md b/skills/tech-search/SKILL.md index 4428cde..6024bd6 100644 --- a/skills/tech-search/SKILL.md +++ b/skills/tech-search/SKILL.md @@ -31,7 +31,9 @@ The guiding rule: broad web finds the map; source-specific searches verify the t ## Sources -Search sources in parallel when practical, but do not force every source. Skipped sources are fine when they are irrelevant, unauthenticated, or low-signal. +Before searching, run `dotagents sources --compact` to discover available methods. Use the best available method for each source; fall back to the next if it fails at runtime. + +Search sources in parallel when practical, but do not force every source. Skipped sources are fine when they are irrelevant, unauthenticated, or low-signal. If a source shows as disabled in `dotagents sources`, do not attempt it. Reference: `references/reddit-discord-cli-eval.md` records the repo evaluation behind the `rdt-cli` and `discord-cli` recommendations. @@ -86,38 +88,44 @@ Read threads at `https://news.ycombinator.com/item?id=` and cite the H ### 4. Reddit -Preferred path when installed: +Use the method shown by `dotagents sources reddit`. + +**Target subreddits:** r/ExperiencedDevs, r/ClaudeAI, r/ClaudeCode, r/LocalLLaMA, r/MachineLearning, r/devops, r/commandline, r/neovim, r/Python, r/mcp, r/cybersecurity +Pick 2-3 relevant subreddits. Avoid broad Reddit for ambiguous terms; it will chase engagement from irrelevant communities. Add context keywords, e.g. `uv poetry Python packaging`, not `poetry`. 
+ +**rdt-cli** (when best method is rdt-cli): ```bash rdt search "<query>" -s relevance -t month -n 10 --compact --json rdt search "<query>" -r <subreddit> -s top -t year -n 10 --compact --json rdt read <post_id> -n 20 --json ``` -**Target subreddits:** r/ExperiencedDevs, r/ClaudeAI, r/ClaudeCode, r/LocalLLaMA, r/MachineLearning, r/devops, r/commandline, r/neovim, r/Python, r/mcp, r/cybersecurity +On VPS/headless hosts, `rdt search` may return Reddit `forbidden` without browser cookies; do not copy cookie secrets into chat. -Pick 2-3 relevant subreddits. Avoid broad Reddit for ambiguous terms; it will chase engagement from irrelevant communities. Add context keywords, e.g. `uv poetry Python packaging`, not `poetry`. +**pullpush** (fallback for historical posts or VPS): `https://api.pullpush.io/reddit/search/submission/?q=<query>&size=5&sort=desc&sort_type=score` -Raw Reddit `.json` endpoints often 403 and should be treated as a fallback only. On VPS/headless hosts, `rdt search` may return Reddit `forbidden` without browser cookies; do not copy cookie secrets into chat. For historical posts or VPS fallback, use Pullpush API (`https://api.pullpush.io/reddit/search/submission/?q=<query>&size=5&sort=desc&sort_type=score`). +**websearch** (fallback): `site:reddit.com <query>` via WebSearch. ### 5. X.com -Use `x-cli` for X.com searches. Check auth with `x-cli auth status`. +Use the method shown by `dotagents sources x.com`. Treat X as commentary unless the author is primary to the topic. **Power users:** @karpathy, @fchollet, @hardmaru, @thorstenball, @thdxr, @steipete, @banteg +**x-cli** (when best method is x-cli): ```bash x-cli search "(from:karpathy OR from:fchollet) <query>" --type latest --count 10 --json x-cli search "<query> (recommended OR \"game changer\")" --type top --count 10 --json ``` -Fallback: `site:x.com <query>` via WebSearch if x-cli auth is broken. Treat X as commentary unless the author is primary to the topic. +**websearch** (fallback): `site:x.com <query>` via WebSearch. ### 6. 
Discord -Discord remains opt-in because it uses user-token auth and may carry account-risk. Search Discord only when the topic is relevant to known communities (ML, LLMs, Claude, agents, evals, fine-tuning, etc). Skip for generic/unrelated topics. +Discord is disabled by default (ToS risk). If `dotagents sources` shows it disabled, skip entirely. When enabled, search only for topics relevant to known communities (ML, LLMs, Claude, agents, evals, fine-tuning). Skip for generic/unrelated topics. -**Preferred CLI for repeated/community monitoring**: `discord-cli` (`uv tool install kabi-discord-cli`) can sync accessible Discord channels into local SQLite, then search/export them with structured YAML/JSON. Use it only for accounts the user controls. Do not ask the user to paste raw Discord tokens into chat logs. +**discord-cli** (when enabled and available): can sync accessible Discord channels into local SQLite, then search/export with structured YAML/JSON. Do not ask the user to paste raw Discord tokens into chat logs. ```bash discord status --yaml