Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions DISCLAIMER.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
## ⚠️ Disclaimer

This project is for **educational and security research** purposes only.

- All API keys were scraped from **publicly available** sources (e.g. GitHub public repos).
- No private repositories were accessed or targeted.
- This is intended to raise awareness about sensitive data exposure.
- If you own an API key that appears here and want it removed, please open an issue or contact the maintainers directly.
- All real-looking keys are assumed to be expired, revoked, or IP-bound.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,4 +174,13 @@ python app.py

> “Nothing is organized here. That’s the fun part.” 😈

## ⚠️ Disclaimer

This project is for **educational and security research** purposes only.

- All API keys were scraped from **publicly available** sources (e.g. GitHub public repos).
- No private repositories were accessed or targeted.
- This is intended to raise awareness about sensitive data exposure.
- If you own an API key that appears here and want it removed, please open an issue or contact the maintainers directly.
- All real-looking keys are assumed to be expired, revoked, or IP-bound.

136 changes: 121 additions & 15 deletions config.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,145 @@
```python
import os
import sys
from dotenv import load_dotenv
import re
import logging
from dotenv import load_dotenv
import json
from typing import Dict, Set, List, Optional

# -----------------------------------------------------------------------------
# sandvox.py
#
# This script searches for potentially sensitive information (like OpenAI API keys)
# within public GitHub repositories using the GitHub API. It loads environment
# variables for configuration and logs its activities.
# -----------------------------------------------------------------------------

# ✅ Load .env variables first
# --- Constants and Configuration ---

# Load environment variables from .env file (if it exists). This should be
# done as early as possible to make env vars available.
load_dotenv()

# === GitHub Token ===
# GitHub API Token - Retrieve from environment variables. This is critical
# for authenticating requests.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
print("❌ GitHub token not found in environment variables.")
print("❌ GitHub token not found in environment variables. Please set GITHUB_TOKEN.")
sys.exit(1)

# === Headers for GitHub API ===

# GitHub API Headers - Used for authentication and specifying the
# expected response format. Using a dictionary for better readability.
HEADERS = {
"Authorization": f"token {GITHUB_TOKEN}",
"Accept": "application/vnd.github.v3+json"
}

SEARCH_KEYWORDS = {
# Search Keywords - Terms to search for within repository content. Using a
# set for efficient membership checking.
SEARCH_KEYWORDS: Set[str] = {
"openai api key",
"chatgpt key",
"openai secret",
"sk-"
"sk-" # Common prefix for OpenAI API keys.
}

REGEX_PATTERNS = {
"OpenAI API Key": re.compile(r"sk-[A-Za-z0-9]{32,}"),}

# Regular Expression Patterns - More sophisticated pattern matching to identify
# specific types of sensitive data. Using a dictionary to organize the patterns
# with descriptive keys.
REGEX_PATTERNS: Dict[str, re.Pattern] = {
"OpenAI API Key": re.compile(r"sk-[A-Za-z0-9]{32,}")
}

# === GitHub Search Settings ===
RESULTS_PER_PAGE = 30
MAX_PAGES = 4000
# GitHub Search Settings - Configure the GitHub API search behavior.
RESULTS_PER_PAGE = 100 # Max allowed by GitHub API is 100
MAX_PAGES = 10 # Reduced to avoid excessive API calls and potential rate limiting. Increase cautiously.

# === Storage Paths ===
# Storage Paths - File paths for storing results and logs.
RESULTS_FILE = "data/results.json"
LOG_FILE = "logs/activity.log"

# --- Logging Setup ---
# Configure logging to both a file and the console.
#
# logging.FileHandler opens its target file as soon as it is constructed, so
# the parent directory of LOG_FILE has to exist before basicConfig() runs;
# otherwise importing this module fails with FileNotFoundError on a fresh
# checkout.
_log_directory = os.path.dirname(LOG_FILE)
if _log_directory:
    os.makedirs(_log_directory, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(sys.stdout),  # Log to console as well
    ],
)

# --- Helper Functions ---

def create_directory_if_not_exists(path: str) -> None:
    """
    Ensure the parent directory of ``path`` exists.

    ``path`` is treated as a file path: only its directory component (as
    returned by ``os.path.dirname``) is created, including any missing
    intermediate directories. A bare filename with no directory component
    is a no-op.

    Failures are logged rather than raised, so a later attempt to open
    ``path`` may still fail and must be handled by the caller.
    """
    directory = os.path.dirname(path)
    if not directory:
        return
    try:
        # exist_ok=True removes the check-then-create race: if something else
        # creates the directory between a check and makedirs, it is not an
        # error.
        os.makedirs(directory, exist_ok=True)
    except OSError as e:
        logging.error(f"Failed to create directory {directory}: {e}")


def save_results(results: List[Dict], filename: str) -> None:
    """
    Save ``results`` to ``filename`` as JSON indented by four spaces.

    The parent directory of ``filename`` is created first if needed. I/O
    errors while writing are logged, not raised. Errors raised by JSON
    serialization (for example ``TypeError`` for a non-serializable value)
    propagate to the caller.
    """
    create_directory_if_not_exists(filename)  # Ensure the directory exists

    # Serialize before opening the file: open(..., "w") truncates the target
    # immediately, so serializing afterwards would leave an empty or partial
    # file behind whenever serialization fails.
    payload = json.dumps(results, indent=4)

    try:
        with open(filename, "w", encoding="utf-8") as f:
            f.write(payload)
    except OSError as e:
        logging.error(f"Error writing to {filename}: {e}")
    else:
        logging.info(f"Results saved to {filename}")


def build_search_query(keywords: Set[str]) -> str:
    """
    Build a single query string from ``keywords``.

    Each keyword is wrapped in double quotes and the quoted terms are joined
    with ``" OR "``. Terms appear in the iteration order of ``keywords``; an
    empty collection yields an empty string.
    """
    quote = '"{}"'.format
    return " OR ".join(map(quote, keywords))

# --- Main Script Logic (Placeholder - to be implemented) ---
# This section would contain the main logic for:
# 1. Building the GitHub search query
# 2. Making API requests to GitHub.
# 3. Parsing the API responses.
# 4. Identifying sensitive information within the search results.
# 5. Saving the results and logging activity.


def main():
    """
    Entry point: log start-up, build the search query, and persist results.

    The search step is still a placeholder, so ``results`` is always empty
    and only the "no results" path runs.
    """
    logging.info("Starting sandvox script...")

    # Build the query from the configured keywords and record it in the log.
    search_query = build_search_query(SEARCH_KEYWORDS)
    logging.info(f"Search query: {search_query}")

    # Placeholder: no search is performed yet, so there is nothing to save.
    results = []

    if not results:
        logging.info("No results found.")
    else:
        save_results(results, RESULTS_FILE)

    logging.info("Script finished.")


if __name__ == "__main__":
main()

```