|
| 1 | +<!-- hash:f243687755be8a55512bfc36c597330dad9bc62da95ce00930838c676222121a --> |
| 2 | +# Code Review for check_files.py |
| 3 | + |
| 4 | +```python |
| 5 | +#!/usr/bin/env python3 |
| 6 | +import os |
| 7 | +import re |
| 8 | +import subprocess |
| 9 | +import sys |
| 10 | +import shutil |
| 11 | + |
| 12 | +# ------------------------------- |
| 13 | +# Carpetas a ignorar |
| 14 | +# ------------------------------- |
| 15 | +IGNORE_DIRS = {"venv", ".git", "node_modules", "__pycache__", ".mypy_cache"} |
| 16 | + |
| 17 | +# ------------------------------- |
| 18 | +# Filtros por stack para limpiar salida |
| 19 | +# ------------------------------- |
| 20 | +FILTERS_BY_STACK = { |
| 21 | + "django": [ |
| 22 | + (r"(SECRET_KEY\s*=\s*['\"].*?['\"])", "SECRET_KEY = '***REDACTED***'"), |
| 23 | + (r"(PASSWORD\s*=\s*['\"].*?['\"])", "PASSWORD = '***REDACTED***'"), |
| 24 | + (r"(API_KEY\s*=\s*['\"].*?['\"])", "API_KEY = '***REDACTED***'"), |
| 25 | + (r"(DEBUG\s*=\s*True)", r"\1 # DEV MODE: No usar en producción"), |
| 26 | + (r"(ALLOWED_HOSTS\s*=\s*\[.*?\])", "ALLOWED_HOSTS = ['*'] # DEV ONLY") |
| 27 | + ], |
| 28 | + "flask": [ |
| 29 | + (r"(SECRET_KEY\s*=\s*['\"].*?['\"])", "SECRET_KEY = '***REDACTED***'"), |
| 30 | + (r"(SQLALCHEMY_DATABASE_URI\s*=\s*['\"].*?['\"])", "SQLALCHEMY_DATABASE_URI = '***REDACTED***'"), |
| 31 | + (r"(DEBUG\s*=\s*True)", r"\1 # DEV MODE: No usar en producción") |
| 32 | + ], |
| 33 | + "node": [ |
| 34 | + (r"(process\.env\.(?:[A-Z_]+_?KEY|PASSWORD|TOKEN|SECRET)[^\n]*)", "/* ***REDACTED*** */"), |
| 35 | + (r"(['\"](?:AIza|sk-|ghp_)[A-Za-z0-9_\-]+['\"])", "'***REDACTED***'"), |
| 36 | + (r"(app\.listen\(\s*\d+\s*\))", r"\1 // DEV PORT, ajustar en producción") |
| 37 | + ], |
| 38 | + "react": [ |
| 39 | + (r"(process\.env\.REACT_APP_[A-Z0-9_]+)", "/* ***REDACTED*** */"), |
| 40 | + (r"(https?:\/\/[^\s'\"]+\/api[^\s'\"]*)", "'***R |
| 41 | +``` |
| 42 | + |
| 43 | +**Explanation:** |
| 44 | + |
| 45 | +This Python script is designed to sanitize code in a project directory, primarily by redacting sensitive information like API keys, passwords, and secrets before sharing or committing the code to a public repository. It aims to prevent accidental exposure of credentials. |
| 46 | + |
| 47 | +Here's a breakdown of the code's functionality: |
| 48 | + |
| 49 | +1. **Imports:** |
| 50 | + - `os`: Provides functions for interacting with the operating system (e.g., navigating directories, listing files). |
| 51 | + - `re`: Enables regular expression operations for pattern matching and replacement in strings. This is crucial for finding and redacting sensitive data. |
| 52 | + - `subprocess`: Allows running external commands (like `git status`) and capturing their output. |
| 53 | + - `sys`: Provides access to system-specific parameters and functions (e.g., command-line arguments). |
| 54 | + - `shutil`: Offers high-level file operations (e.g., copying files and directory trees). It is imported but not used in the snippet shown; it is presumably used elsewhere in the full script. |
| 55 | + |
| 56 | +2. **`IGNORE_DIRS`:** |
| 57 | + - `IGNORE_DIRS = {"venv", ".git", "node_modules", "__pycache__", ".mypy_cache"}`: This is a set of directory names that the script will skip when traversing the project directory. These directories typically contain: |
| 58 | + - `venv`: Python virtual environments (contains dependencies). |
| 59 | + - `.git`: The Git repository directory (contains version control information). |
| 60 | + - `node_modules`: Node.js dependencies (often very large). |
| 61 | + - `__pycache__`: Python bytecode cache directories. |
| 62 | + - `.mypy_cache`: The mypy type-checker cache directory. |
| 63 | + |
| 64 | + Ignoring these directories speeds up the process and prevents the script from accidentally modifying files within them, which could break dependencies or version control. |
| 65 | + |
| 66 | +3. **`FILTERS_BY_STACK`:** |
| 67 | + - This is a dictionary that holds regular expression-based filters for different technology stacks (e.g., "django", "flask", "node", "react"). The filters are used to identify and replace sensitive information in code files. |
| 68 | + - Each key in the dictionary represents a technology stack. |
| 69 | + - Each value is a list of tuples, where each tuple contains: |
| 70 | + - A regular expression pattern (as a string). |
| 71 | + - A replacement string. |
| 72 | + - **Example (Django):** |
| 73 | + - `(r"(SECRET_KEY\s*=\s*['\"].*?['\"])", "SECRET_KEY = '***REDACTED***'")` |
| 74 | + - This regular expression looks for lines of code that define a `SECRET_KEY` variable. Specifically: |
| 75 | + - `SECRET_KEY`: Matches the literal string "SECRET_KEY". |
| 76 | + - `\s*=\s*`: Matches zero or more whitespace characters followed by an equals sign followed by zero or more whitespace characters. |
| 77 | + - `['\"]`: Matches either a single quote or a double quote. |
| 78 | + - `.*?`: Matches any character (except newline) zero or more times, but as few times as possible (non-greedy). This matches the actual key. |
| 79 | + - `['\"]`: Matches the closing single or double quote. |
| 80 | + - The replacement string replaces the matched assignment (the text from `SECRET_KEY` through the closing quote, not necessarily the whole line) with `SECRET_KEY = '***REDACTED***'`, effectively obscuring the actual secret key. |
| 81 | + - `(r"(DEBUG\s*=\s*True)", r"\1 # DEV MODE: No usar en producción")` |
| 82 | + - This regex matches the text `DEBUG = True` (with any amount of whitespace around the `=`). The `\1` in the replacement string refers to the first captured group (in this case, the entire matched string: `DEBUG = True`), so the original text is kept and a comment is appended warning not to use debug mode in production. |
| 83 | + - `(r"(ALLOWED_HOSTS\s*=\s*\[.*?\])", "ALLOWED_HOSTS = ['*'] # DEV ONLY")` |
| 84 | + - This regex replaces the `ALLOWED_HOSTS` setting with `ALLOWED_HOSTS = ['*']`. This is a common development setting, but is extremely insecure for production as it allows any host to access the application. The comment highlights that this is for development only. |
| 85 | +
|
| 86 | + - **Other Stacks:** Similar filters are defined for Flask, Node.js, and React, targeting common places where secrets and sensitive data might be stored. The Node filters look for `process.env` references whose names indicate keys, passwords, tokens, or secrets, look for string literals that begin with common API key prefixes (like "AIza", "sk-", "ghp_"), and append a comment to `app.listen(<port>)` calls. The React filters target environment variables prefixed with `REACT_APP_` and URLs that contain an `/api` path. |
| 87 | +
|
| 88 | +**How the script is *likely* used (based on common patterns):** |
| 89 | +
|
| 90 | +The provided code snippet represents the **definition** of the filters and the ignored directories. A larger script would typically: |
| 91 | +
|
| 92 | +1. **Take a directory as input** (using `sys.argv` to get a command-line argument). |
| 93 | +2. **Walk through the directory tree** (using `os.walk`). |
| 94 | +3. **Identify the technology stack** used in the project (likely through heuristics like checking for certain files, e.g., `manage.py` for Django, `package.json` for Node.js). |
| 95 | +4. **Based on the detected stack, apply the appropriate filters** from `FILTERS_BY_STACK` to each file: |
| 96 | + - Read the file content. |
| 97 | + - Iterate through the regex patterns in the stack's filter list. |
| 98 | + - Use `re.sub()` to find and replace matches with the corresponding replacement string. |
| 99 | + - Write the modified content back to the file. |
| 100 | +5. **Optionally, create a new sanitized directory** (copying the original tree and applying the replacements to the copy, so the original files are left untouched). This is where `shutil` would be used. |
| 101 | +
|
| 102 | +**In summary:** |
| 103 | +
|
| 104 | +This script is a code sanitizer that helps prevent the accidental disclosure of sensitive information by automatically redacting or modifying potentially sensitive data within a project's codebase. It uses regular expressions and stack-specific filters to target common locations where secrets are stored. It's a crucial tool for security-conscious developers who need to share or publish their code without exposing their credentials. |
0 commit comments