devavijitsingh · devavijitsingh · Aug 2, 2025 · Aug 2, 2025 · Aug 2, 2025 · Aug 5, 2025
diff --git a/DISCLAIMER.md b/DISCLAIMER.md
@@ -0,0 +1,9 @@
+## ⚠️ Disclaimer
+
+This project is for **educational and security research** purposes only.
+
+- All API keys were scraped from **publicly available** sources (e.g. GitHub public repos).
+- No private repositories were accessed or targeted.
+- This is intended to raise awareness about sensitive data exposure.
+- If any API key owner wants their data removed, please open an issue or contact directly.
+- All real-looking keys are assumed to be expired, revoked, or IP-bound.
diff --git a/README.md b/README.md
@@ -174,4 +174,13 @@ python app.py
 
 > “Nothing is organized here. That’s the fun part.” 😈
 
+## ⚠️ Disclaimer
+
+This project is for **educational and security research** purposes only.
+
+- All API keys were scraped from **publicly available** sources (e.g. GitHub public repos).
+- No private repositories were accessed or targeted.
+- This is intended to raise awareness about sensitive data exposure.
+- If any API key owner wants their data removed, please open an issue or contact directly.
+- All real-looking keys are assumed to be expired, revoked, or IP-bound.
 
diff --git a/config.py b/config.py
@@ -1,39 +1,145 @@
+```python
 import os
 import sys
-from dotenv import load_dotenv
 import re
+import logging
+from dotenv import load_dotenv
+import json
+from typing import Dict, Set, List, Optional
+
+# -----------------------------------------------------------------------------
+# sandvox.py
+#
+# This script searches for potentially sensitive information (like OpenAI API keys)
+# within public GitHub repositories using the GitHub API. It loads environment
+# variables for configuration and logs its activities.
+# -----------------------------------------------------------------------------
 
-# ✅ Load .env variables first
+# --- Constants and Configuration ---
+
+# Load environment variables from .env file (if it exists).  This should be
+# done as early as possible to make env vars available.
 load_dotenv()
 
-# === GitHub Token ===
+# GitHub API Token - Retrieve from environment variables.  This is critical
+# for authenticating requests.
 GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
 if not GITHUB_TOKEN:
-    print("❌ GitHub token not found in environment variables.")
+    print("❌ GitHub token not found in environment variables.  Please set GITHUB_TOKEN.")
     sys.exit(1)
 
-# === Headers for GitHub API ===
-
+# GitHub API Headers -  Used for authentication and specifying the
+# expected response format.  Using a dictionary for better readability.
 HEADERS = {
     "Authorization": f"token {GITHUB_TOKEN}",
     "Accept": "application/vnd.github.v3+json"
 }
 
-SEARCH_KEYWORDS = {
+# Search Keywords - Terms to search for within repository content.  Using a
+# set for efficient membership checking.
+SEARCH_KEYWORDS: Set[str] = {
     "openai api key",
     "chatgpt key",
     "openai secret",
-    "sk-"
+    "sk-"  # Common prefix for OpenAI API keys.
 }
 
-REGEX_PATTERNS = {
-    "OpenAI API Key": re.compile(r"sk-[A-Za-z0-9]{32,}"),}
-
+# Regular Expression Patterns -  More sophisticated pattern matching to identify
+# specific types of sensitive data. Using a dictionary to organize the patterns
+# with descriptive keys.
+REGEX_PATTERNS: Dict[str, re.Pattern] = {
+    "OpenAI API Key": re.compile(r"sk-[A-Za-z0-9]{32,}")
+}
 
-# === GitHub Search Settings ===
-RESULTS_PER_PAGE = 30
-MAX_PAGES = 4000
+# GitHub Search Settings -  Configure the GitHub API search behavior.
+RESULTS_PER_PAGE = 100  # Max allowed by GitHub API is 100
+MAX_PAGES = 10  # Reduced to avoid excessive API calls and potential rate limiting.  Increase cautiously.
 
-# === Storage Paths ===
+# Storage Paths -  File paths for storing results and logs.
 RESULTS_FILE = "data/results.json"
 LOG_FILE = "logs/activity.log"
+
+# --- Logging Setup ---
+# Configure logging to both a file and the console.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+    handlers=[
+        logging.FileHandler(LOG_FILE),
+        logging.StreamHandler(sys.stdout)  # Log to console as well
+    ]
+)
+
+# --- Helper Functions ---
+
+def create_directory_if_not_exists(path: str):
+    """
+    Creates a directory if it does not already exist.  Handles potential
+    race conditions.
+    """
+    directory = os.path.dirname(path)
+    if directory and not os.path.exists(directory):
+        try:
+            os.makedirs(directory)
+        except OSError as e:
+            logging.error(f"Failed to create directory {directory}: {e}")
+            # Consider re-raising the exception or exiting if directory creation fails
+            # as the script may not function correctly without it.
+
+
+def save_results(results: List[Dict], filename: str):
+    """
+    Saves search results to a JSON file.
+    """
+    create_directory_if_not_exists(filename)  # Ensure the directory exists
+    try:
+        with open(filename, "w") as f:
+            json.dump(results, f, indent=4)
+        logging.info(f"Results saved to {filename}")
+    except IOError as e:
+        logging.error(f"Error writing to {filename}: {e}")
+
+
+def build_search_query(keywords: Set[str]) -> str:
+    """
+    Constructs the GitHub search query string from a set of keywords.
+    """
+    query_parts = [f'"{keyword}"' for keyword in keywords]
+    return " OR ".join(query_parts)
+
+# --- Main Script Logic (Placeholder - to be implemented) ---
+# This section would contain the main logic for:
+# 1. Building the GitHub search query
+# 2. Making API requests to GitHub.
+# 3. Parsing the API responses.
+# 4. Identifying sensitive information within the search results.
+# 5.  Saving the results and logging activity.
+
+
+def main():
+    """
+    Main function to orchestrate the search and reporting process.
+    """
+    logging.info("Starting sandvox script...")
+
+    # Example: Construct a search query
+    search_query = build_search_query(SEARCH_KEYWORDS)
+    logging.info(f"Search query: {search_query}")
+
+    # Placeholder:  Implement GitHub API interaction here.
+    # Example:  Replace this with actual API calls and result processing.
+    # results = perform_github_search(search_query)
+    results = []  # Placeholder - replace with results from GitHub search.
+    # Example:  Simulate some results for demonstration purposes.
+    if results:
+        save_results(results, RESULTS_FILE)
+    else:
+        logging.info("No results found.")
+
+    logging.info("Script finished.")
+
+
+if __name__ == "__main__":
+    main()
+
+```