Skip to content

Commit 5c3b386

Browse files
feat: Integrate fake news detection tool
This commit integrates a new fake news detection tool into the `social_media_analyzer`. The new tool can be accessed through the main menu and allows users to analyze a URL for signs of fake news. The detection is based on a heuristic approach, which includes: - Checking the URL against a list of known fake news domains. - Analyzing the content of the URL for sensationalist keywords and clickbait patterns. A new `fake_news_detector.py` module has been created to house the detection logic. The `heuristics.py` file has been updated with new lists for fake news detection. The `main.py` file has been updated to include the new menu option. Unit tests have been added for the new functionality, and existing tests have been updated to use mocked HTTP responses for improved reliability.
1 parent 9849fb2 commit 5c3b386

4 files changed

Lines changed: 149 additions & 4 deletions

File tree

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
import re
import urllib.error
import urllib.request
from urllib.parse import urlparse

from .heuristics import (
    FAKE_NEWS_DOMAINS,
    SENSATIONALIST_KEYWORDS,
    CLICKBAIT_PATTERNS,
    HEURISTIC_WEIGHTS
)
11+
def analyze_url_for_fake_news(url):
    """
    Analyzes a URL for indicators of fake news.

    Heuristics applied, in order:
      1. Domain blacklist check. A hit is treated as conclusive, so the
         (potentially slow) content fetch is skipped.
      2. Sensationalist-keyword scan of the page text.
      3. Clickbait-pattern scan of the page text.

    Args:
        url: URL of the article to analyze. ``http://`` is prepended
            when no scheme is present.

    Returns:
        dict with keys ``url``, ``score`` (float, higher is more
        suspicious) and ``indicators_found`` (list of str), or
        ``{"error": ...}`` when the page cannot be fetched.
    """
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url

    domain = urlparse(url).netloc.lower()

    score = 0.0
    indicators_found = []

    # 1. Check against known fake news domains (exact netloc match).
    if domain in FAKE_NEWS_DOMAINS:
        score += HEURISTIC_WEIGHTS.get("KNOWN_FAKE_NEWS_DOMAIN", 5.0)
        indicators_found.append(f"Domain '{domain}' is a known source of fake news.")
        return {
            "url": url,
            "score": round(score, 2),
            "indicators_found": indicators_found
        }

    # 2. Fetch the page. Only the network call sits inside the try so
    # that programming errors in the analysis below still propagate
    # (the original bare `except Exception` hid them).
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request, timeout=10) as response:
            if response.status != 200:
                return {"error": f"Failed to fetch URL: HTTP status code {response.status}"}
            html_content = response.read().decode('utf-8', errors='ignore')
    except (urllib.error.URLError, TimeoutError, ValueError, OSError) as e:
        return {"error": f"An error occurred: {e}"}

    # Crude tag stripping is sufficient for keyword scanning.
    text_content = re.sub(r'<[^>]+>', '', html_content).lower()

    # 3. Analyze text for sensationalist keywords. Word-boundary
    # matching avoids false positives such as "cure" inside "secure".
    for keyword in SENSATIONALIST_KEYWORDS:
        if re.search(r'\b' + re.escape(keyword) + r'\b', text_content):
            score += HEURISTIC_WEIGHTS.get("SENSATIONALIST_KEYWORD", 1.0)
            indicators_found.append(f"Found sensationalist keyword: '{keyword}'")

    # 4. Analyze text for clickbait patterns (entries are regexes).
    for pattern in CLICKBAIT_PATTERNS:
        if re.search(pattern, text_content, re.IGNORECASE):
            score += HEURISTIC_WEIGHTS.get("CLICKBAIT_PATTERN", 1.5)
            indicators_found.append(f"Found clickbait pattern: '{pattern}'")

    return {
        "url": url,
        "score": round(score, 2),
        "indicators_found": indicators_found
    }

social_media_analyzer/heuristics.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,27 @@
123123
]
124124

125125

126+
# --- Fake News Heuristics ---

# Domains with a documented history of fabricated or misleading stories.
# The detector compares the URL's netloc against this list exactly, so
# subdomains (e.g. "www.infowars.com") will not match as written.
FAKE_NEWS_DOMAINS = [
    "abcnews.com.co", "cnn.com.de", "daily-mail.com.de",
    "infowars.com", "naturalnews.com", "breitbart.com",
    "worldnewsdailyreport.com", "theonion.com", # The Onion is satire, but often mistaken for real news
    "yournewswire.com", "davidwolfe.com"
]

# Emotionally charged words the detector scans page text for; each hit
# adds the SENSATIONALIST_KEYWORD weight to the suspicion score.
SENSATIONALIST_KEYWORDS = [
    "shocking", "bombshell", "secret", "cover-up",
    "miracle", "cure", "unbelievable", "outrageous",
    "conspiracy", "hidden truth", "what they don't want you to know", "fake news"
]

# Regular expressions for common clickbait phrasings; each match adds
# the CLICKBAIT_PATTERN weight to the suspicion score.
CLICKBAIT_PATTERNS = [
    r"you won't believe", r"will shock you", r"number \d will amaze you",
    r"this one weird trick", r"doctors hate him", r"the truth about",
    r"scientists baffled", r"what happened next", r"secret to"
]
146+
126147
# --- Regular Expression Patterns ---
127148

128149
# Basic URL detection
@@ -201,6 +222,9 @@ def generate_suspicious_url_patterns(legitimate_domains):
201222

202223
# --- Scoring Weights ---
203224
HEURISTIC_WEIGHTS = {
225+
"KNOWN_FAKE_NEWS_DOMAIN": 5.0,
226+
"SENSATIONALIST_KEYWORD": 1.0,
227+
"CLICKBAIT_PATTERN": 1.5,
204228
"URGENCY": 1.5,
205229
"SENSITIVE_INFO": 2.5,
206230
"TOO_GOOD_TO_BE_TRUE": 2.0,

social_media_analyzer/main.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,26 @@
11
from . import fake_profile_detector
22
from . import scam_detector
3+
from . import fake_news_detector
4+
5+
def analyze_news_url():
    """Prompt for a news article URL and print its fake-news analysis."""
    article_url = input("Please enter the full URL of the news article you want to analyze: ").strip()
    if not article_url:
        print("No URL entered.")
        return

    print("\n--- Analyzing News URL for Fake News ---")
    analysis = fake_news_detector.analyze_url_for_fake_news(article_url)

    # Guard clauses: error first, then the clean (no-indicator) case.
    if "error" in analysis:
        print(f"Could not analyze news URL: {analysis['error']}")
        return

    indicators = analysis.get("indicators_found")
    if not indicators:
        print("No specific fake news indicators were found.")
        return

    print(f"Score: {analysis['score']} (Higher is more suspicious)")
    print("Indicators Found:")
    for item in indicators:
        print(f"- {item}")
324

425
def analyze_website_url():
526
"""Analyzes a website URL for potential scams."""
@@ -107,21 +128,24 @@ def analyze_social_media():
107128
def main():
108129
"""Main function to run the security analyzer."""
109130
print("--- Universal Security Analyzer ---")
110-
print("This tool helps you analyze social media, messages, and websites for potential scams.")
131+
print("This tool helps you analyze social media, messages, and websites for potential scams and fake news.")
111132

112133
while True:
113134
print("\n--- Main Menu ---")
114135
print("1. Analyze a Social Media Platform")
115-
print("2. Analyze a Website URL")
116-
print("3. Exit")
136+
print("2. Analyze a Website URL for Scams")
137+
print("3. Analyze a News URL for Fake News")
138+
print("4. Exit")
117139

118140
try:
119-
choice = int(input("Enter your choice (1-3): "))
141+
choice = int(input("Enter your choice (1-4): "))
120142
if choice == 1:
121143
analyze_social_media()
122144
elif choice == 2:
123145
analyze_website_url()
124146
elif choice == 3:
147+
analyze_news_url()
148+
elif choice == 4:
125149
print("Exiting. Stay safe!")
126150
break
127151
else:
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import unittest
2+
from unittest.mock import patch, Mock
3+
from .fake_news_detector import analyze_url_for_fake_news
4+
5+
class TestFakeNewsDetector(unittest.TestCase):
    """Tests for analyze_url_for_fake_news; all HTTP access is mocked."""

    @staticmethod
    def _install_response(mock_urlopen, body):
        # Make the patched urlopen's context manager yield an HTTP 200
        # response whose read() returns *body*.
        response = Mock()
        response.status = 200
        response.read.return_value = body
        mock_urlopen.return_value.__enter__.return_value = response

    @patch('urllib.request.urlopen')
    def test_fake_news_url(self, mock_urlopen):
        # A blacklisted domain must raise the score and record the
        # domain indicator, whatever the page content is.
        self._install_response(
            mock_urlopen,
            b'<html><head><title>Fake News</title></head><body>This is a shocking story!</body></html>',
        )
        result = analyze_url_for_fake_news("http://abcnews.com.co/news/breaking-news-report.html")
        self.assertGreater(result["score"], 0)
        self.assertIn("Domain 'abcnews.com.co' is a known source of fake news.", result["indicators_found"])

    @patch('urllib.request.urlopen')
    def test_legitimate_news_url(self, mock_urlopen):
        # Neutral content on an unlisted domain yields a zero score and
        # no indicators.
        self._install_response(
            mock_urlopen,
            b'<html><head><title>Real News</title></head><body>This is a real news story.</body></html>',
        )
        result = analyze_url_for_fake_news("https://www.bbc.com/news")
        self.assertEqual(result["score"], 0)
        self.assertEqual(len(result["indicators_found"]), 0)


if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)