From 8585ea9e54259e1b6a5474f78a42fb525861633e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 17 Jan 2026 07:50:10 +0000 Subject: [PATCH 1/4] Initial plan From 772d9d6f6a369fbf3895c524107731ef3b25b9d3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 17 Jan 2026 07:55:25 +0000 Subject: [PATCH 2/4] Add no-topic search feature with environment variables and tests Co-authored-by: DevSecNinja <14926452+DevSecNinja@users.noreply.github.com> --- backend/.env.example | 12 + backend/app/api/routes.py | 12 +- backend/app/cli.py | 22 +- backend/app/services/github_service.py | 116 +++++++++- backend/app/services/indexer.py | 22 +- backend/app/services/scheduler.py | 13 +- backend/tests/test_no_topic_search.py | 291 +++++++++++++++++++++++++ docker-compose.yml | 4 + 8 files changed, 475 insertions(+), 17 deletions(-) create mode 100644 backend/tests/test_no_topic_search.py diff --git a/backend/.env.example b/backend/.env.example index f8bf7d2..a2b871a 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -20,3 +20,15 @@ ENVIRONMENT=production # Example: ROOT_PATH=/api/v1 if your cloud platform handles routing # and you want the app accessible at the root URL ROOT_PATH= + +# No-Topic Search Configuration (optional, for testing) +# Set to "true" to search for automation files without requiring the "hadiscover" topic +# This is useful for testing scenarios to find more diverse repositories +# WARNING: Keep this "false" in production to maintain opt-in privacy +ENABLE_NO_TOPIC_SEARCH=false + +# Maximum Repositories Configuration (optional, for testing) +# Set a maximum number of repositories to index (e.g., 10 for testing) +# Leave empty or unset for no limit +# This is useful for testing to avoid performance issues with large result sets +MAX_REPOSITORIES= diff --git a/backend/app/api/routes.py b/backend/app/api/routes.py index 28713a0..9a4e0ae 100644 --- a/backend/app/api/routes.py +++ b/backend/app/api/routes.py @@ -273,7 +273,17 @@ async def trigger_indexing( async def run_indexing(): """Background task to run indexing.""" - indexer = IndexingService() + # Get no-topic search configuration + enable_no_topic_search = os.getenv( + "ENABLE_NO_TOPIC_SEARCH", "false" + ).lower() in ("true", "1", "yes") + max_repositories_str = os.getenv("MAX_REPOSITORIES") + max_repositories = int(max_repositories_str) if max_repositories_str else None + + indexer = IndexingService( + enable_no_topic_search=enable_no_topic_search, + max_repositories=max_repositories, + ) # Create a new session for background task from app.models import SessionLocal diff --git a/backend/app/cli.py b/backend/app/cli.py index 45dcc4a..4d7973f 100644 --- a/backend/app/cli.py +++ b/backend/app/cli.py @@ -41,8 +41,28 @@ async def run_indexing(): if not github_token: logger.warning("GITHUB_TOKEN not set - API rate limits will be lower") + # Get no-topic search configuration + enable_no_topic_search = os.getenv("ENABLE_NO_TOPIC_SEARCH", "false").lower() in ( + "true", + "1", + "yes", + ) + max_repositories_str = os.getenv("MAX_REPOSITORIES") + max_repositories = int(max_repositories_str) if max_repositories_str else None + + if enable_no_topic_search: + logger.info("No-topic search enabled") + if max_repositories: + logger.info(f"Maximum repositories limit: {max_repositories}") + else: + logger.info("Topic-based search enabled (default)") + # Create indexing service - indexer = IndexingService(github_token=github_token) + indexer = IndexingService( + github_token=github_token, + enable_no_topic_search=enable_no_topic_search, + max_repositories=max_repositories, + ) # Get database session (this also initializes the database) db = get_db_session() diff --git a/backend/app/services/github_service.py b/backend/app/services/github_service.py index 361a013..43479d6 100644 --- a/backend/app/services/github_service.py +++ b/backend/app/services/github_service.py @@ -34,9 +34,23 @@ class GitHubService: "ha-discover", ] # Support both topics for backwards compatibility - def __init__(self, token: Optional[str] = None): - """Initialize GitHub service with optional authentication token.""" + def __init__( + self, + token: Optional[str] = None, + enable_no_topic_search: bool = False, + max_repositories: Optional[int] = None, + ): + """ + Initialize GitHub service with optional authentication token. + + Args: + token: GitHub personal access token + enable_no_topic_search: If True, search for automation files without topic requirement + max_repositories: Maximum number of repositories to return (None = no limit) + """ self.token = token or os.getenv("GITHUB_TOKEN") + self.enable_no_topic_search = enable_no_topic_search + self.max_repositories = max_repositories self.headers = { "Accept": "application/vnd.github.v3+json", } @@ -68,6 +82,7 @@ def _check_rate_limit(self, response: httpx.Response, operation: str) -> None: async def search_repositories(self, per_page: int = 100) -> List[Dict]: """ Search for repositories with the hadiscover or ha-discover topics. + If enable_no_topic_search is True, search for automation files without topic requirement. Args: per_page: Number of results per page (max 100) @@ -79,14 +94,15 @@ async def search_repositories(self, per_page: int = 100) -> List[Dict]: seen_repos = set() # Track repos to avoid duplicates async with httpx.AsyncClient() as client: - # Search for each topic - for topic in self.SEARCH_TOPICS: + if self.enable_no_topic_search: + # Search for repositories with automation files (no topic requirement) + # Use a broad search for Home Assistant automation files page = 1 while True: try: url = f"{self.BASE_URL}/search/repositories" params = { - "q": f"topic:{topic}", + "q": "automations.yaml in:path", "per_page": per_page, "page": page, } @@ -126,21 +142,99 @@ async def search_repositories(self, per_page: int = 100) -> List[Dict]: } ) - # Check if there are more pages - if len(items) < per_page: + # Check if we've reached the maximum number of repositories + if ( + self.max_repositories is not None + and len(all_repositories) >= self.max_repositories + ): + logger.info( + f"Reached max repository limit: {self.max_repositories}" + ) + break + + # Check if we've reached the maximum or there are no more pages + if ( + self.max_repositories is not None + and len(all_repositories) >= self.max_repositories + ) or len(items) < per_page: break page += 1 except httpx.HTTPError as e: logger.error( - f"Error searching repositories with topic '{topic}': {e}" + f"Error searching repositories with automation files: {e}" ) break - logger.info( - f"Found {len(all_repositories)} repositories with topics {self.SEARCH_TOPICS}" - ) + logger.info( + f"Found {len(all_repositories)} repositories with automation files (no topic search)" + ) + else: + # Original topic-based search + # Search for each topic + for topic in self.SEARCH_TOPICS: + page = 1 + while True: + try: + url = f"{self.BASE_URL}/search/repositories" + params = { + "q": f"topic:{topic}", + "per_page": per_page, + "page": page, + } + + response = await client.get( + url, headers=self.headers, params=params, timeout=30.0 + ) + + # Check for rate limiting (status 429 or 403 with rate limit message) + self._check_rate_limit(response, "search_repositories") + + response.raise_for_status() + + data = response.json() + items = data.get("items", []) + + if not items: + break + + for repo in items: + repo_key = f"{repo['owner']['login']}/{repo['name']}" + # Skip if we've already seen this repo + if repo_key in seen_repos: + continue + + seen_repos.add(repo_key) + all_repositories.append( + { + "name": repo["name"], + "owner": repo["owner"]["login"], + "description": repo.get("description", ""), + "url": repo["html_url"], + "default_branch": repo.get( + "default_branch", "main" + ), + "stars": repo.get("stargazers_count", 0), + } + ) + + # Check if there are more pages + if len(items) < per_page: + break + + page += 1 + + except httpx.HTTPError as e: + logger.error( + f"Error searching repositories with topic '{topic}': {e}" + ) + break + + logger.info( + f"Found {len(all_repositories)} repositories with topics {self.SEARCH_TOPICS}" + ) + return all_repositories async def get_file_content( diff --git a/backend/app/services/indexer.py b/backend/app/services/indexer.py index 49cec63..b9da22b 100644 --- a/backend/app/services/indexer.py +++ b/backend/app/services/indexer.py @@ -16,9 +16,25 @@ class IndexingService: """Service for indexing Home Assistant automations from GitHub repositories.""" - def __init__(self, github_token: Optional[str] = None): - """Initialize indexing service with GitHub API access.""" - self.github_service = GitHubService(token=github_token) + def __init__( + self, + github_token: Optional[str] = None, + enable_no_topic_search: bool = False, + max_repositories: Optional[int] = None, + ): + """ + Initialize indexing service with GitHub API access. + + Args: + github_token: GitHub personal access token + enable_no_topic_search: If True, search for automation files without topic requirement + max_repositories: Maximum number of repositories to index (None = no limit) + """ + self.github_service = GitHubService( + token=github_token, + enable_no_topic_search=enable_no_topic_search, + max_repositories=max_repositories, + ) self.parser = AutomationParser() async def index_repositories(self, db: Session) -> dict: diff --git a/backend/app/services/scheduler.py b/backend/app/services/scheduler.py index da4d94c..3e12da1 100644 --- a/backend/app/services/scheduler.py +++ b/backend/app/services/scheduler.py @@ -46,9 +46,20 @@ async def run_indexing_task(self): if not github_token: logger.warning("GITHUB_TOKEN not set - API rate limits will be lower") + # Get no-topic search configuration + enable_no_topic_search = os.getenv( + "ENABLE_NO_TOPIC_SEARCH", "false" + ).lower() in ("true", "1", "yes") + max_repositories_str = os.getenv("MAX_REPOSITORIES") + max_repositories = int(max_repositories_str) if max_repositories_str else None + # Create indexing service if not already created if not self.indexer: - self.indexer = IndexingService(github_token=github_token) + self.indexer = IndexingService( + github_token=github_token, + enable_no_topic_search=enable_no_topic_search, + max_repositories=max_repositories, + ) # Get database session db = self._get_db() diff --git a/backend/tests/test_no_topic_search.py b/backend/tests/test_no_topic_search.py new file mode 100644 index 0000000..469fe4d --- /dev/null +++ b/backend/tests/test_no_topic_search.py @@ -0,0 +1,291 @@ +"""Tests for no-topic search functionality.""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from app.services.github_service import GitHubService +from app.services.indexer import IndexingService + + +@pytest.mark.asyncio +async def test_github_service_no_topic_search_disabled_by_default(): + """Test that GitHubService defaults to topic-based search.""" + service = GitHubService() + assert service.enable_no_topic_search is False + assert service.max_repositories is None + + +@pytest.mark.asyncio +async def test_github_service_no_topic_search_enabled(): + """Test that GitHubService can enable no-topic search.""" + service = GitHubService(enable_no_topic_search=True) + assert service.enable_no_topic_search is True + + +@pytest.mark.asyncio +async def test_github_service_max_repositories(): + """Test that GitHubService respects max_repositories limit.""" + service = GitHubService(enable_no_topic_search=True, max_repositories=10) + assert service.max_repositories == 10 + + +@pytest.mark.asyncio +async def test_search_repositories_with_no_topic_search(): + """Test that search_repositories uses different query when no-topic search is enabled.""" + service = GitHubService(enable_no_topic_search=True) + + # Mock response with test data + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "items": [ + { + "name": "test-repo", + "owner": {"login": "testowner"}, + "description": "Test repository", + "html_url": "https://github.com/testowner/test-repo", + "default_branch": "main", + "stargazers_count": 5, + } + ] + } + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_client.__aenter__.return_value = mock_client + mock_client.get.return_value = mock_response + mock_client_class.return_value = mock_client + + repos = await service.search_repositories() + + # Verify the correct query was used + call_args = mock_client.get.call_args + assert call_args is not None + params = call_args[1]["params"] + assert "automations.yaml in:path" in params["q"] + assert "topic:" not in params["q"] + + # Verify repository was returned + assert len(repos) == 1 + assert repos[0]["name"] == "test-repo" + + +@pytest.mark.asyncio +async def test_search_repositories_with_topic_search(): + """Test that search_repositories uses topic query when no-topic search is disabled.""" + service = GitHubService(enable_no_topic_search=False) + + # Mock response with test data + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "items": [ + { + "name": "test-repo", + "owner": {"login": "testowner"}, + "description": "Test repository", + "html_url": "https://github.com/testowner/test-repo", + "default_branch": "main", + "stargazers_count": 5, + } + ] + } + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_client.__aenter__.return_value = mock_client + mock_client.get.return_value = mock_response + mock_client_class.return_value = mock_client + + repos = await service.search_repositories() + + # Verify the correct query was used (should be topic-based) + call_args = mock_client.get.call_args + assert call_args is not None + params = call_args[1]["params"] + assert "topic:" in params["q"] + + # Verify repository was returned + assert len(repos) == 1 + assert repos[0]["name"] == "test-repo" + + +@pytest.mark.asyncio +async def test_search_repositories_respects_max_repositories(): + """Test that search_repositories stops after reaching max_repositories limit.""" + service = GitHubService(enable_no_topic_search=True, max_repositories=2) + + # Mock response with 3 repos, but we should only get 2 + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "items": [ + { + "name": f"test-repo-{i}", + "owner": {"login": "testowner"}, + "description": f"Test repository {i}", + "html_url": f"https://github.com/testowner/test-repo-{i}", + "default_branch": "main", + "stargazers_count": 5, + } + for i in range(3) + ] + } + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_client.__aenter__.return_value = mock_client + mock_client.get.return_value = mock_response + mock_client_class.return_value = mock_client + + repos = await service.search_repositories() + + # Verify only 2 repositories were returned + assert len(repos) == 2 + + +@pytest.mark.asyncio +async def test_search_repositories_no_max_limit(): + """Test that search_repositories returns all repos when max_repositories is None.""" + service = GitHubService(enable_no_topic_search=True, max_repositories=None) + + # Mock response with 3 repos + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "items": [ + { + "name": f"test-repo-{i}", + "owner": {"login": "testowner"}, + "description": f"Test repository {i}", + "html_url": f"https://github.com/testowner/test-repo-{i}", + "default_branch": "main", + "stargazers_count": 5, + } + for i in range(3) + ] + } + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_client.__aenter__.return_value = mock_client + mock_client.get.return_value = mock_response + mock_client_class.return_value = mock_client + + repos = await service.search_repositories() + + # Verify all 3 repositories were returned + assert len(repos) == 3 + + +@pytest.mark.asyncio +async def test_indexing_service_passes_parameters(): + """Test that IndexingService correctly passes parameters to GitHubService.""" + indexer = IndexingService( + github_token="test_token", + enable_no_topic_search=True, + max_repositories=10, + ) + + assert indexer.github_service.enable_no_topic_search is True + assert indexer.github_service.max_repositories == 10 + assert indexer.github_service.token == "test_token" + + +@pytest.mark.asyncio +async def test_search_repositories_avoids_duplicates_in_no_topic_mode(): + """Test that no-topic search doesn't return duplicate repositories.""" + service = GitHubService(enable_no_topic_search=True) + + # Mock response with duplicate repos + mock_response_page1 = MagicMock() + mock_response_page1.status_code = 200 + mock_response_page1.json.return_value = { + "items": [ + { + "name": "test-repo", + "owner": {"login": "testowner"}, + "description": "Test repository", + "html_url": "https://github.com/testowner/test-repo", + "default_branch": "main", + "stargazers_count": 5, + } + ] + } + + # Second page returns same repo (unlikely but possible) + mock_response_page2 = MagicMock() + mock_response_page2.status_code = 200 + mock_response_page2.json.return_value = { + "items": [] # Empty to stop pagination + } + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_client.__aenter__.return_value = mock_client + mock_client.get.side_effect = [mock_response_page1, mock_response_page2] + mock_client_class.return_value = mock_client + + repos = await service.search_repositories() + + # Should only have one repo (no duplicates) + assert len(repos) == 1 + + +@pytest.mark.asyncio +async def test_environment_variable_parsing(): + """Test that environment variables are correctly parsed for boolean values.""" + import os + + # Test various truthy values + for value in ["true", "True", "TRUE", "1", "yes", "Yes"]: + os.environ["ENABLE_NO_TOPIC_SEARCH"] = value + result = os.getenv("ENABLE_NO_TOPIC_SEARCH", "false").lower() in ( + "true", + "1", + "yes", + ) + assert result is True, f"Failed for value: {value}" + + # Test various falsy values + for value in ["false", "False", "FALSE", "0", "no", "No", ""]: + os.environ["ENABLE_NO_TOPIC_SEARCH"] = value + result = os.getenv("ENABLE_NO_TOPIC_SEARCH", "false").lower() in ( + "true", + "1", + "yes", + ) + assert result is False, f"Failed for value: {value}" + + # Clean up + if "ENABLE_NO_TOPIC_SEARCH" in os.environ: + del os.environ["ENABLE_NO_TOPIC_SEARCH"] + + +@pytest.mark.asyncio +async def test_max_repositories_integer_parsing(): + """Test that MAX_REPOSITORIES environment variable is correctly parsed as integer.""" + import os + + # Test valid integer + os.environ["MAX_REPOSITORIES"] = "10" + max_repositories_str = os.getenv("MAX_REPOSITORIES") + max_repositories = int(max_repositories_str) if max_repositories_str else None + assert max_repositories == 10 + + # Test None when not set + if "MAX_REPOSITORIES" in os.environ: + del os.environ["MAX_REPOSITORIES"] + max_repositories_str = os.getenv("MAX_REPOSITORIES") + max_repositories = int(max_repositories_str) if max_repositories_str else None + assert max_repositories is None + + # Test empty string + os.environ["MAX_REPOSITORIES"] = "" + max_repositories_str = os.getenv("MAX_REPOSITORIES") + max_repositories = int(max_repositories_str) if max_repositories_str else None + assert max_repositories is None + + # Clean up + if "MAX_REPOSITORIES" in os.environ: + del os.environ["MAX_REPOSITORIES"] diff --git a/docker-compose.yml b/docker-compose.yml index fb68627..d212953 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,6 +14,10 @@ services: # Optional: Add your GitHub token for higher API rate limits - GITHUB_TOKEN=${GITHUB_TOKEN:-} - ENVIRONMENT=development + # Optional: Enable no-topic search for testing (default: false) + - ENABLE_NO_TOPIC_SEARCH=${ENABLE_NO_TOPIC_SEARCH:-false} + # Optional: Limit maximum repositories for testing (no limit if not set) + - MAX_REPOSITORIES=${MAX_REPOSITORIES:-} volumes: # Persist database across container restarts - api_data:/app/data From ae6ed1a66d163ed96bc9ae25c47185fc2001941b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 17 Jan 2026 07:56:15 +0000 Subject: [PATCH 3/4] Update documentation for no-topic search feature Co-authored-by: DevSecNinja <14926452+DevSecNinja@users.noreply.github.com> --- ARCHITECTURE.md | 6 +++++- README.md | 12 ++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 9bdc9ea..5b9f1b6 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -44,7 +44,9 @@ SQLite Database ←→ GitHub API ### GitHub Service (`app/services/github_service.py`) -- Search repositories by topic via GitHub API +- Search repositories by topic via GitHub API (default: `hadiscover` or `ha-discover`) +- Optional no-topic search mode for testing/development (`ENABLE_NO_TOPIC_SEARCH`) +- Configurable repository limit for testing (`MAX_REPOSITORIES`) - Fetch file contents from repos - Discover automation files using path patterns - Optional GitHub token for higher rate limits (5k/hr vs 60/hr) @@ -140,6 +142,8 @@ Fault-tolerant YAML parsing handles varied Home Assistant configurations. Partia Only indexes repositories with explicit `hadiscover` topic. Respects privacy and user consent. +**Testing Mode**: For development and testing, can be configured to search without topic requirement using `ENABLE_NO_TOPIC_SEARCH=true` environment variable. A `MAX_REPOSITORIES` limit can also be set to control result size. + ### Hourly Scheduled Indexing APScheduler runs automatic indexing every hour. Manual trigger available in development mode only. diff --git a/README.md b/README.md index 5188a65..adfe427 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,18 @@ cd backend && source venv/bin/activate && pytest tests/ -v CI automatically tests Docker containers, API endpoints, and integration on every PR. +#### Testing Configuration (Optional) + +For testing and development purposes, you can enable no-topic search to find more repositories: + +```bash +# In backend/.env or as environment variables +ENABLE_NO_TOPIC_SEARCH=true # Search without requiring hadiscover topic +MAX_REPOSITORIES=10 # Limit results for testing (optional) +``` + +**Warning**: Keep `ENABLE_NO_TOPIC_SEARCH=false` in production to maintain the opt-in privacy model. + ### API Documentation OpenAPI/Swagger docs available at once running. From d1c49070a4de09408d279a951c28a7ea8d7f862b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 17 Jan 2026 07:57:34 +0000 Subject: [PATCH 4/4] Fix code review feedback: improve pagination loop control Co-authored-by: DevSecNinja <14926452+DevSecNinja@users.noreply.github.com> --- backend/app/services/github_service.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/backend/app/services/github_service.py b/backend/app/services/github_service.py index 43479d6..80ec2eb 100644 --- a/backend/app/services/github_service.py +++ b/backend/app/services/github_service.py @@ -98,6 +98,7 @@ async def search_repositories(self, per_page: int = 100) -> List[Dict]: # Search for repositories with automation files (no topic requirement) # Use a broad search for Home Assistant automation files page = 1 + reached_limit = False while True: try: url = f"{self.BASE_URL}/search/repositories" @@ -150,13 +151,11 @@ async def search_repositories(self, per_page: int = 100) -> List[Dict]: logger.info( f"Reached max repository limit: {self.max_repositories}" ) + reached_limit = True break - # Check if we've reached the maximum or there are no more pages - if ( - self.max_repositories is not None - and len(all_repositories) >= self.max_repositories - ) or len(items) < per_page: + # Stop pagination if we've reached the limit or there are no more pages + if reached_limit or len(items) < per_page: break page += 1