Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion ARCHITECTURE.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ SQLite Database ←→ GitHub API

### GitHub Service (`app/services/github_service.py`)

- Search repositories by topic via GitHub API
- Search repositories by topic via GitHub API (default: `hadiscover` or `ha-discover`)
- Optional no-topic search mode for testing/development (`ENABLE_NO_TOPIC_SEARCH`)
- Configurable repository limit for testing (`MAX_REPOSITORIES`)
- Fetch file contents from repos
- Discover automation files using path patterns
- Optional GitHub token for higher rate limits (5k/hr vs 60/hr)
Expand Down Expand Up @@ -140,6 +142,8 @@ Fault-tolerant YAML parsing handles varied Home Assistant configurations. Partia

Only indexes repositories with an explicit `hadiscover` (or legacy `ha-discover`) topic. Respects privacy and user consent.

**Testing Mode**: For development and testing, can be configured to search without topic requirement using `ENABLE_NO_TOPIC_SEARCH=true` environment variable. A `MAX_REPOSITORIES` limit can also be set to control result size.

### Hourly Scheduled Indexing

APScheduler runs automatic indexing every hour. Manual trigger available in development mode only.
Expand Down
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,18 @@ cd backend && source venv/bin/activate && pytest tests/ -v

CI automatically tests Docker containers, API endpoints, and integration on every PR.

#### Testing Configuration (Optional)

For testing and development purposes, you can enable no-topic search to find more repositories:

```bash
# In backend/.env or as environment variables
ENABLE_NO_TOPIC_SEARCH=true # Search without requiring hadiscover topic
MAX_REPOSITORIES=10 # Limit results for testing (optional)
```

**Warning**: Keep `ENABLE_NO_TOPIC_SEARCH=false` in production to maintain the opt-in privacy model.

### API Documentation

OpenAPI/Swagger docs available at <http://localhost:8000/docs> once running.
Expand Down
12 changes: 12 additions & 0 deletions backend/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,15 @@ ENVIRONMENT=production
# Example: ROOT_PATH=/api/v1 if your cloud platform handles routing
# and you want the app accessible at the root URL
ROOT_PATH=

# No-Topic Search Configuration (optional, for testing)
# Set to "true" to search for automation files without requiring the "hadiscover" topic
# This is useful for testing scenarios to find more diverse repositories
# WARNING: Keep this "false" in production to maintain opt-in privacy
ENABLE_NO_TOPIC_SEARCH=false

# Maximum Repositories Configuration (optional, for testing)
# Set a maximum number of repositories to index (e.g., 10 for testing)
# Leave empty or unset for no limit
# This is useful for testing to avoid performance issues with large result sets
MAX_REPOSITORIES=
12 changes: 11 additions & 1 deletion backend/app/api/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,17 @@ async def trigger_indexing(

async def run_indexing():
"""Background task to run indexing."""
indexer = IndexingService()
# Get no-topic search configuration
enable_no_topic_search = os.getenv(
"ENABLE_NO_TOPIC_SEARCH", "false"
).lower() in ("true", "1", "yes")
max_repositories_str = os.getenv("MAX_REPOSITORIES")
max_repositories = int(max_repositories_str) if max_repositories_str else None

indexer = IndexingService(
enable_no_topic_search=enable_no_topic_search,
max_repositories=max_repositories,
)
# Create a new session for background task
from app.models import SessionLocal

Expand Down
22 changes: 21 additions & 1 deletion backend/app/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,28 @@ async def run_indexing():
if not github_token:
logger.warning("GITHUB_TOKEN not set - API rate limits will be lower")

# Get no-topic search configuration
enable_no_topic_search = os.getenv("ENABLE_NO_TOPIC_SEARCH", "false").lower() in (
"true",
"1",
"yes",
)
max_repositories_str = os.getenv("MAX_REPOSITORIES")
max_repositories = int(max_repositories_str) if max_repositories_str else None

if enable_no_topic_search:
logger.info("No-topic search enabled")
if max_repositories:
logger.info(f"Maximum repositories limit: {max_repositories}")
else:
logger.info("Topic-based search enabled (default)")

# Create indexing service
indexer = IndexingService(github_token=github_token)
indexer = IndexingService(
github_token=github_token,
enable_no_topic_search=enable_no_topic_search,
max_repositories=max_repositories,
)

# Get database session (this also initializes the database)
db = get_db_session()
Expand Down
115 changes: 104 additions & 11 deletions backend/app/services/github_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,23 @@ class GitHubService:
"ha-discover",
] # Support both topics for backwards compatibility

def __init__(self, token: Optional[str] = None):
"""Initialize GitHub service with optional authentication token."""
def __init__(
self,
token: Optional[str] = None,
enable_no_topic_search: bool = False,
max_repositories: Optional[int] = None,
):
"""
Initialize GitHub service with optional authentication token.

Args:
token: GitHub personal access token
enable_no_topic_search: If True, search for automation files without topic requirement
max_repositories: Maximum number of repositories to return (None = no limit)
"""
self.token = token or os.getenv("GITHUB_TOKEN")
self.enable_no_topic_search = enable_no_topic_search
self.max_repositories = max_repositories
self.headers = {
"Accept": "application/vnd.github.v3+json",
}
Expand Down Expand Up @@ -68,6 +82,7 @@ def _check_rate_limit(self, response: httpx.Response, operation: str) -> None:
async def search_repositories(self, per_page: int = 100) -> List[Dict]:
"""
Search for repositories with the hadiscover or ha-discover topics.
If enable_no_topic_search is True, search for automation files without topic requirement.

Args:
per_page: Number of results per page (max 100)
Expand All @@ -79,14 +94,16 @@ async def search_repositories(self, per_page: int = 100) -> List[Dict]:
seen_repos = set() # Track repos to avoid duplicates

async with httpx.AsyncClient() as client:
# Search for each topic
for topic in self.SEARCH_TOPICS:
if self.enable_no_topic_search:
# Search for repositories with automation files (no topic requirement)
# Use a broad search for Home Assistant automation files
page = 1
reached_limit = False
while True:
try:
url = f"{self.BASE_URL}/search/repositories"
params = {
"q": f"topic:{topic}",
"q": "automations.yaml in:path",
"per_page": per_page,
"page": page,
}
Expand Down Expand Up @@ -126,21 +143,97 @@ async def search_repositories(self, per_page: int = 100) -> List[Dict]:
}
)

# Check if there are more pages
if len(items) < per_page:
# Check if we've reached the maximum number of repositories
if (
self.max_repositories is not None
and len(all_repositories) >= self.max_repositories
):
logger.info(
f"Reached max repository limit: {self.max_repositories}"
)
reached_limit = True
break

# Stop pagination if we've reached the limit or there are no more pages
if reached_limit or len(items) < per_page:
break

page += 1

except httpx.HTTPError as e:
logger.error(
f"Error searching repositories with topic '{topic}': {e}"
f"Error searching repositories with automation files: {e}"
)
break

logger.info(
f"Found {len(all_repositories)} repositories with topics {self.SEARCH_TOPICS}"
)
logger.info(
f"Found {len(all_repositories)} repositories with automation files (no topic search)"
)
else:
# Original topic-based search
# Search for each topic
for topic in self.SEARCH_TOPICS:
page = 1
while True:
try:
url = f"{self.BASE_URL}/search/repositories"
params = {
"q": f"topic:{topic}",
"per_page": per_page,
"page": page,
}

response = await client.get(
url, headers=self.headers, params=params, timeout=30.0
)

# Check for rate limiting (status 429 or 403 with rate limit message)
self._check_rate_limit(response, "search_repositories")

response.raise_for_status()

data = response.json()
items = data.get("items", [])

if not items:
break

for repo in items:
repo_key = f"{repo['owner']['login']}/{repo['name']}"
# Skip if we've already seen this repo
if repo_key in seen_repos:
continue

seen_repos.add(repo_key)
all_repositories.append(
{
"name": repo["name"],
"owner": repo["owner"]["login"],
"description": repo.get("description", ""),
"url": repo["html_url"],
"default_branch": repo.get(
"default_branch", "main"
),
"stars": repo.get("stargazers_count", 0),
}
)

# Check if there are more pages
if len(items) < per_page:
break

page += 1

except httpx.HTTPError as e:
logger.error(
f"Error searching repositories with topic '{topic}': {e}"
)
break

logger.info(
f"Found {len(all_repositories)} repositories with topics {self.SEARCH_TOPICS}"
)

return all_repositories

async def get_file_content(
Expand Down
22 changes: 19 additions & 3 deletions backend/app/services/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,25 @@
class IndexingService:
"""Service for indexing Home Assistant automations from GitHub repositories."""

def __init__(self, github_token: Optional[str] = None):
"""Initialize indexing service with GitHub API access."""
self.github_service = GitHubService(token=github_token)
def __init__(
    self,
    github_token: Optional[str] = None,
    enable_no_topic_search: bool = False,
    max_repositories: Optional[int] = None,
):
    """
    Set up the collaborators needed for an indexing run.

    Args:
        github_token: GitHub personal access token; when omitted the
            GitHub service falls back to the GITHUB_TOKEN environment
            variable.
        enable_no_topic_search: If True, repositories are discovered by
            searching for automation files instead of requiring the
            opt-in topic.
        max_repositories: Maximum number of repositories to index
            (None = no limit).
    """
    # The parser has no configuration; the GitHub client carries all of it.
    self.parser = AutomationParser()
    self.github_service = GitHubService(
        token=github_token,
        enable_no_topic_search=enable_no_topic_search,
        max_repositories=max_repositories,
    )

async def index_repositories(self, db: Session) -> dict:
Expand Down
13 changes: 12 additions & 1 deletion backend/app/services/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,20 @@ async def run_indexing_task(self):
if not github_token:
logger.warning("GITHUB_TOKEN not set - API rate limits will be lower")

# Get no-topic search configuration
enable_no_topic_search = os.getenv(
"ENABLE_NO_TOPIC_SEARCH", "false"
).lower() in ("true", "1", "yes")
max_repositories_str = os.getenv("MAX_REPOSITORIES")
max_repositories = int(max_repositories_str) if max_repositories_str else None

# Create indexing service if not already created
if not self.indexer:
self.indexer = IndexingService(github_token=github_token)
self.indexer = IndexingService(
github_token=github_token,
enable_no_topic_search=enable_no_topic_search,
max_repositories=max_repositories,
)

# Get database session
db = self._get_db()
Expand Down
Loading
Loading