Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 60 additions & 31 deletions app/chatbot.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,81 @@
import logging
import re

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from app.scraper import scrape_latest_news

# Configure logging: only errors are emitted by default.
logging.basicConfig(level=logging.ERROR)

# Select GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the small pre-trained model and tokenizer from Hugging Face.
# (Single load — the file previously carried duplicated loads of two
# different models, a diff-merge artifact.)
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# GPT-2-style tokenizers ship without a pad token; reuse EOS so that
# tokenizer(..., padding=True) does not raise.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def truncate_incomplete_articles(text):
    """Drop the trailing article segment unless it ends in a full timestamp.

    The generated text is a sequence of article segments separated by a
    40-character '=' rule; a segment counts as complete when it ends with an
    ISO-8601 UTC timestamp (e.g. 2024-10-12T10:00:00Z).
    """
    separator = "=" * 40
    segments = text.split(separator)

    # A complete final segment ends with a timestamp like 2024-10-12T10:00:00Z.
    ends_complete = re.search(
        r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$", segments[-1]
    )
    if ends_complete is None:
        segments = segments[:-1]

    return separator.join(segments).strip()


# Generate a chatbot response using the LLM grounded in scraped news context.
def generate_response(input_text):
    """Generate a chatbot reply grounded in freshly scraped news articles.

    Args:
        input_text: The user's query; forwarded to the news scraper.

    Returns:
        The model's reply as a string, or a human-readable error message
        (never raises — all failures are logged and reported as text).
    """
    try:
        news_articles = scrape_latest_news(input_text)

        # The scraper is expected to return a list of dicts; anything else
        # (error payload, None) is reported instead of crashing below.
        if not isinstance(news_articles, list) or not all(
            isinstance(article, dict) for article in news_articles
        ):
            return "Received an unexpected response while fetching news."

        if not news_articles:
            return "Sorry, I couldn't find any recent news on this topic."

        # One block per article, terminated by a 40-char '=' rule so that
        # truncate_incomplete_articles() can detect a cut-off final block.
        news_context = "\n".join(
            f"Title: {article.get('title', 'N/A')}\n"
            f"Description: {article.get('description', 'N/A')}\n"
            f"URL: {article.get('url', 'N/A')}\n"
            f"Source: {article.get('source', 'N/A')}\n"
            f"Published At: {article.get('publishedAt', 'N/A')}\n"
            f"{'=' * 40}"
            for article in news_articles
        )

        combined_input = f"{news_context}\n\nChatbot:"

        # Truncate to the model's context window using the tokenizer's own
        # token limit.  (A whitespace word count is the wrong unit here and
        # could over- or under-truncate the prompt.)
        inputs = tokenizer(
            combined_input,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=tokenizer.model_max_length,
        ).to(device)

        # Sampled generation; pad_token_id silences the missing-pad warning.
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=150,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            top_k=40,
            pad_token_id=tokenizer.eos_token_id,
        )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        # Keep only the model's continuation after the "Chatbot:" marker;
        # fall back to the full decode if the marker was truncated away.
        if "Chatbot:" in response:
            response = response.split("Chatbot:")[-1].strip()

        # Strip a possibly half-generated trailing article block.
        return truncate_incomplete_articles(response)
    except Exception as e:
        logging.error(f"Error generating response: {str(e)}", exc_info=True)
        return "Oops! Something went wrong while processing your request."


# Example usage when run as a script.
if __name__ == "__main__":
    sample_query = "latest technology news"
    print(generate_response(sample_query))
142 changes: 57 additions & 85 deletions app/scraper.py
Original file line number Diff line number Diff line change
@@ -1,86 +1,58 @@
def scrape_latest_news(query=None):
    """Return a fixed set of dummy news articles for offline testing.

    Args:
        query: Accepted for signature compatibility with the API-backed
            scraper (callers pass the user's query); ignored here because
            the data is static.

    Returns:
        A list of 10 dicts with keys 'title', 'description', 'url',
        'source' and 'published_at'.
    """
    articles = [
        {
            "title": "AI Breakthrough in Cancer Detection",
            "description": "Scientists at Global Health Institute have developed a revolutionary AI system capable of detecting early signs of cancer with an accuracy of 98%. This advancement is expected to transform the medical industry by drastically reducing diagnostic errors and enabling earlier treatment interventions, potentially saving millions of lives annually.",
            "url": "https://www.example.com/ai-cancer-detection",
            "source": "Health Today",
            "published_at": "2024-10-12T10:00:00Z"
        },
        {
            "title": "Global Climate Summit Urges Immediate Action",
            "description": "World leaders gathered at the 2024 Global Climate Summit to address the urgent need for sustainable energy solutions. With record-breaking heatwaves and natural disasters increasing worldwide, experts are calling for accelerated investments in renewable energy and stricter carbon emission regulations to mitigate the impact of climate change.",
            "url": "https://www.example.com/global-climate-summit",
            "source": "Eco News",
            "published_at": "2024-10-11T14:45:00Z"
        },
        {
            "title": "Nepal Celebrates Vijaya Dashami Amidst Challenges",
            "description": "Prime Minister K P Sharma Oli and President Ram Chandra Paudel extend greetings to citizens during Vijaya Dashami, urging support for those impacted by recent floods and landslides. The festival's religious importance is being celebrated in parallel with ongoing recovery efforts in affected areas.",
            "url": "https://www.example.com/nepal-dashami-celebration",
            "source": "Devdiscourse",
            "published_at": "2024-10-12T15:56:00Z"
        },
        {
            "title": "Floods and Landslides Devastate Nepal",
            "description": "Recent natural disasters in Nepal have caused 240 fatalities and displaced thousands, severely impacting infrastructure and disrupting travel during the Dashain festival. The government has allocated emergency funds for relief efforts, but the recovery process is expected to take months.",
            "url": "https://www.example.com/nepal-floods-landslides",
            "source": "Devdiscourse",
            "published_at": "2024-10-12T14:30:00Z"
        },
        {
            "title": "Global Tech Companies Eye Expansion in Africa",
            "description": "Tech giants such as Google and Microsoft have announced plans to expand their presence in Africa, aiming to tap into the continent's rapidly growing digital economy. By investing in infrastructure, training, and local talent, these companies hope to establish Africa as a major hub for technology innovation in the coming years.",
            "url": "https://www.example.com/tech-expansion-africa",
            "source": "Tech World",
            "published_at": "2024-10-10T09:00:00Z"
        },
        {
            "title": "SpaceX Successfully Launches First All-Civilian Mission to Mars",
            "description": "In a historic achievement, SpaceX launched its first all-civilian crewed mission to Mars. The crew, composed of scientists and engineers, will conduct groundbreaking research on the Martian surface. This mission marks a significant step toward SpaceX's goal of establishing a permanent human colony on Mars within the next decade.",
            "url": "https://www.example.com/spacex-mars-mission",
            "source": "Space News",
            "published_at": "2024-10-09T18:30:00Z"
        },
        {
            # Title restored: the original entry carried a single mis-encoded
            # character ("ç") instead of a headline.
            "title": "MIT Develops Long-Lasting Renewable Energy Storage",
            "description": "A team of researchers from MIT has developed a new battery technology capable of storing renewable energy for up to 10 times longer than current solutions. This breakthrough could solve one of the major challenges facing the adoption of solar and wind energy by providing reliable storage during periods of low production.",
            "url": "https://www.example.com/renewable-energy-storage",
            "source": "Science Daily",
            "published_at": "2024-10-11T11:20:00Z"
        },
        {
            "title": "Global Food Prices Surge Amidst Supply Chain Disruptions",
            "description": "The global food supply chain continues to face unprecedented disruptions due to ongoing geopolitical conflicts and climate-related disasters. As a result, food prices have surged by 15% in the last quarter alone, placing significant strain on low-income populations and leading to increased calls for food security reforms.",
            "url": "https://www.example.com/global-food-price-surge",
            "source": "Financial Times",
            "published_at": "2024-10-08T08:10:00Z"
        },
        {
            "title": "Advancements in Quantum Computing Pave the Way for Breakthroughs in Medicine",
            "description": "Quantum computing is set to revolutionize medical research, with new algorithms that can model complex molecular structures at an unprecedented scale. These advancements are expected to accelerate drug discovery processes and provide new insights into genetic diseases, opening the door for personalized medicine breakthroughs.",
            "url": "https://www.example.com/quantum-computing-medicine",
            "source": "Tech Insights",
            "published_at": "2024-10-07T12:00:00Z"
        },
        {
            "title": "World Cup 2024: The Rise of Underdogs",
            "description": "The 2024 FIFA World Cup has seen unexpected victories from teams considered underdogs, sparking excitement among fans worldwide. With several top-ranked teams eliminated early, the tournament has become one of the most unpredictable and thrilling in recent memory, capturing the spirit of competition on the global stage.",
            "url": "https://www.example.com/world-cup-2024-underdogs",
            "source": "Sports Daily",
            "published_at": "2024-10-12T17:45:00Z"
        }
    ]
    return articles
import requests
from datetime import datetime, timedelta

# Example usage for testing: print every dummy article to stdout.
if __name__ == "__main__":
    articles = scrape_latest_news()
    for article in articles:
        print(f"Title: {article['title']}")
        print(f"Description: {article['description']}")
        print(f"URL: {article['url']}")
        print(f"Source: {article['source']}")
        # NOTE(review): this expects the dummy-data scraper's 'published_at'
        # key; the NewsAPI-backed scraper variant emits 'publishedAt'
        # instead — confirm which variant is in scope before running.
        print(f"Published At: {article['published_at']}")
        print("=" * 40)
def scrape_latest_news(query="news"):
    """Fetch recent English-language articles for *query* from NewsAPI.

    Searches the last 7 days via the /v2/everything endpoint, sorted
    newest-first, up to 50 articles.

    Args:
        query: Search phrase forwarded to NewsAPI's ``q`` parameter.

    Returns:
        A list of dicts with keys 'title', 'description', 'url', 'source',
        'publishedAt', 'author' and 'image'; an empty list on any error.
    """
    # Search window: the last 7 days, formatted YYYY-MM-DD as NewsAPI expects.
    today = datetime.today()
    from_date = (today - timedelta(days=7)).strftime('%Y-%m-%d')
    to_date = today.strftime('%Y-%m-%d')

    # TODO(review): placeholder credential — load the real key from an
    # environment variable or config file; never hard-code secrets.
    api_key = 'api_key'
    url = "https://newsapi.org/v2/everything"

    params = {
        "q": query,
        "from": from_date,
        "to": to_date,
        "sortBy": "publishedAt",
        "apiKey": api_key,
        "language": "en",
        "pageSize": 50,
    }

    try:
        # An explicit timeout keeps the chatbot from hanging indefinitely if
        # NewsAPI is unreachable (requests has no default timeout).
        response = requests.get(url, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()

            if "articles" in data and data["articles"]:
                news_articles = []
                for article in data["articles"]:
                    # NewsAPI may return explicit nulls for any field, so
                    # coalesce with `or` where a null would otherwise crash
                    # (e.g. a null 'source' object).
                    news_articles.append({
                        'title': article.get('title', 'No title available'),
                        'description': article.get('description', 'No description available'),
                        'url': article.get('url', 'No URL available'),
                        'source': (article.get('source') or {}).get('name', 'No source available'),
                        'publishedAt': article.get('publishedAt', 'No published date available'),
                        'author': article.get('author', 'Unknown Author'),
                        'image': article.get('urlToImage', 'No image available'),
                    })
                return news_articles
            else:
                print("No articles found.")
                return []
        else:
            print(f"Error fetching news: {response.status_code} - {response.text}")
            return []

    except Exception as e:
        # Best-effort scraper: report and return an empty list so the
        # chatbot can degrade gracefully instead of crashing.
        print(f"Error in scrape_latest_news: {str(e)}")
        return []
31 changes: 20 additions & 11 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,33 @@ def gradio_chatbot(input_text):
response = generate_response(input_text)
return response

# Initialize the Gradio interface (single text-in / text-out chatbot UI).
# The plain inputs="text"/outputs="text" residue from the old version is
# removed; only the configured Textbox components remain.
gradio_interface = gr.Interface(
    fn=gradio_chatbot,
    inputs=gr.Textbox(
        label="Enter your query here",
        placeholder="Type something here...",
        lines=3,
        max_length=500,
    ),
    outputs=[gr.Textbox(
        label="Chatbot Response",
        placeholder="The chatbot's response will appear here...",
        lines=5,
        interactive=False,
    )],
    title="NewsBot Chatbot",
    description="A chatbot that provides recent news articles based on your queries. Just type your topic and get the latest updates!",
    allow_flagging="never",
    live=False,
)

# Run the Flask app (blocking call).
def run_flask():
    # debug=False: the Werkzeug debug reloader installs signal handlers,
    # which fails if Flask is ever started from a non-main thread; this
    # also matches the app.run(...) call in the __main__ block.
    app.run(port=5000, debug=False, threaded=False)

def run_gradio():
    """Launch the Gradio UI; share=False keeps the interface private."""
    gradio_interface.launch(share=False)

if __name__ == "__main__":
    # Start the Gradio UI on a background thread so it does not block the
    # Flask server.  (Renamed from the misleading 'flask_thread' — the
    # thread runs Gradio, while Flask occupies the main thread below.)
    gradio_thread = Thread(target=run_gradio)
    gradio_thread.start()

    # Flask runs in the main thread; this call blocks until shutdown.
    app.run(port=5000, debug=False, threaded=False)