33 changes: 33 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,33 @@
{
"name": "Python 3",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
"customizations": {
"codespaces": {
"openFiles": [
"README.md",
"app.py"
]
},
"vscode": {
"settings": {},
"extensions": [
"ms-python.python",
"ms-python.vscode-pylance"
]
}
},
"updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
"postAttachCommand": {
"server": "streamlit run app.py --server.enableCORS false --server.enableXsrfProtection false"
},
"portsAttributes": {
"8501": {
"label": "Application",
"onAutoForward": "openPreview"
}
},
"forwardPorts": [
8501
]
}
5 changes: 4 additions & 1 deletion .gitignore
@@ -15,4 +15,7 @@ aiconfig.log
file_structure.txt

#readme
readme.txt
readme.txt

# secrets
secrets.toml
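
The newly ignored secrets.toml is Streamlit's local secrets store, which the app reads via st.secrets (see app.py below). A minimal sketch of what the untracked .streamlit/secrets.toml might contain (both values are placeholders):

# .streamlit/secrets.toml
OPENAI_API_KEY = "sk-your-key-here"
GITHUB_TOKEN = "ghp_your-token-here"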
57 changes: 53 additions & 4 deletions app.py
@@ -1,8 +1,11 @@
from openai import OpenAI
from chat_rover import ChatRover
import streamlit as st
from github_scraper import GitHubScraper
import time
import random
import os
from dotenv import load_dotenv

AVATAR_IMAGE = 'https://raw.githubusercontent.com/Marcozc19/RepoRover/main/images/rover3.png'
USER_IMAGE = "https://raw.githubusercontent.com/Marcozc19/RepoRover/main/images/moon.png"
@@ -24,11 +27,30 @@
"My creators built me during a Large Language Model hackathon in 2023."
]


# Tests API Key
def is_valid_key(api_key):
try:
client = OpenAI(api_key=api_key)
response = client.chat.completions.create(
model="gpt-3.5-turbo-1106",
messages=[{'role': 'user', 'content': "Hello, world!"}],
max_tokens=5
)
return response.choices is not None
except Exception as e:
print(e)
return False


# Updates rover based on URL
def update_url(url):
gitHubScraper = GitHubScraper(url)
gitHubScraper = GitHubScraper(url, token=st.secrets["GITHUB_TOKEN"])
st.session_state.repo_name = gitHubScraper.get_repo_name()
st.session_state.chat_rover = ChatRover(gitHubScraper)
st.session_state.chat_rover = ChatRover(gitHubScraper, st.session_state.api_key)


# Get the Rover if it exists
@@ -39,6 +61,33 @@ def update_url(url):
# Title for the app
st.title("RepoRover")

# Get API Key
if 'api_key' not in st.session_state or st.session_state.api_key is None:
# Use the key from a .env file if one is present
load_dotenv()
if "OPENAI_API_KEY" in os.environ:
api_key = os.environ["OPENAI_API_KEY"]
if is_valid_key(api_key):
st.session_state.api_key = api_key
st.success("API Key loaded from .env file")
else:
st.error("Invalid API Key from .env file")
elif "OPENAI_API_KEY" in st.secrets:
api_key = st.secrets["OPENAI_API_KEY"]
if is_valid_key(api_key):
st.session_state.api_key = api_key
st.success("API Key loaded from secrets file")
else:
st.error("Invalid API Key from sectets file")
else:
api_key = st.text_input("Enter your OpenAI API key", type="password")
if st.button('Submit'):
if is_valid_key(api_key):
st.session_state.api_key = api_key
st.success("API Key accepted.")
else:
st.error("Invalid API Key.")

# Input box
repo_url = st.text_input("Enter a Repo URL")

@@ -47,7 +96,7 @@ def update_url(url):
if repo_url:
random_fact = random.choice(fun_facts)
st.info(f"Fun Fact: {random_fact}")
with st.spinner(f"Analyzing repository terrain... Please wait..."):
with st.spinner("Analyzing repository terrain... Please wait..."):
update_url(repo_url)
st.session_state.messages = []
st.success(f"New world discovered! Welcome to {st.session_state.repo_name}!")
@@ -70,7 +119,7 @@ def update_url(url):
st.session_state.messages.append({"role": "user", "content": prompt})

# start the spinner
spinner = st.spinner(f"Engaging in digital deep thought...")
spinner = st.spinner("Engaging in digital deep thought...")
spinner.__enter__()

first_chunk_received = False
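Taken together, the new startup code resolves the OpenAI key in a fixed order: the current session, then a local .env file, then Streamlit secrets, then manual input. A condensed sketch of that fallback chain (not the literal app code):

import os
import streamlit as st
from dotenv import load_dotenv

def resolve_api_key():
    # A key already stored in this session wins.
    if st.session_state.get("api_key"):
        return st.session_state.api_key
    # Next, a local .env file, then Streamlit's secrets store.
    load_dotenv()
    if "OPENAI_API_KEY" in os.environ:
        return os.environ["OPENAI_API_KEY"]
    if "OPENAI_API_KEY" in st.secrets:
        return st.secrets["OPENAI_API_KEY"]
    # Otherwise fall back to manual input in the UI.
    return st.text_input("Enter your OpenAI API key", type="password")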
47 changes: 14 additions & 33 deletions chat_rover.py
@@ -1,26 +1,20 @@
from openai import OpenAI
import os
from dotenv import load_dotenv

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.document import Document
from langchain_community.chat_models import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

import tiktoken

# load env
load_dotenv()


class ChatRover():

def __init__(self, gitHubScraper):
self.api_key = os.getenv('OPENAI_API_KEY')
self.client = OpenAI(api_key=self.api_key)
def __init__(self, gitHubScraper, api_key):
self.api_key = api_key
self.client = OpenAI(api_key=api_key)

self.gitHubScraper = gitHubScraper

@@ -50,7 +44,7 @@ def create_file_vector(self):
print("Creating file vector...")
split_data = [Document(page_content=file) for file in files]

embeddings = OpenAIEmbeddings()
embeddings = OpenAIEmbeddings(openai_api_key=self.api_key)
vectorstore = FAISS.from_documents(split_data, embedding=embeddings)
print("File vector complete!")
return vectorstore
@@ -65,21 +59,17 @@ def create_readme_vector(self):
text_splitter = CharacterTextSplitter(chunk_size=3000, chunk_overlap=200)
split_data = [Document(page_content=chunk) for chunk in text_splitter.split_text(data)]

embeddings = OpenAIEmbeddings()
embeddings = OpenAIEmbeddings(openai_api_key=self.api_key)
vectorstore = FAISS.from_documents(split_data, embedding=embeddings)
print("Readme vector complete!")
return vectorstore

# Returns a summary of the contents of file_path that is relevant
# to the user query
def code_summary(self, file_path, query):
custom_prompt = """
Provide a clear and concise summary on the code that you will be given as it relates to a user query.
You should reference specific parts of the code.
Be technical. Your summary will be used by another LLM to explain specific parts of the code.
Focus on those parts that are most relevant to the user query, the user may ask for specific code snippets which you will provide.
Do not speak to or address the user.
Limit your response to 200 words.
Provide a clear and concise summary of the code that you will be given as it relates to a user query. You should reference specific parts of the code. Be technical. Your summary will be used by another LLM to explain specific parts of the code. Focus on the parts that are most relevant to the user query; the user may ask for specific code snippets, which you should provide. Do not speak to or address the user. Limit your response to 200 words.

Code: {code}
User Query: {query}
"""
@@ -115,16 +105,7 @@ def retrieve_context(self, query):
i += 1

role_prompt = f"""
As 'RepoRover', you are a specialized AI expert on the '{self.repo}' repository.
Your expertise includes detailed knowledge of the repository's structure,
critical portions of the README, and summaries of key files based on user queries.
You do not have to use the summaries of files if they are not relevant.
If they are relevant, feel free to copy them verbatum or you may choose to extract
parts of them to best answer the user.
Below is the relevant file structure, selected README excerpts, and summaries of important files.
Using this information, please provide precise answers to the following question,
referencing specific files or sections when useful.
You are responding directly to the user. Only address the user in your response.
As 'RepoRover', you are a specialized AI expert on the '{self.repo}' repository. Your expertise includes detailed knowledge of the repository's structure, critical portions of the README, and summaries of key files based on user queries. You do not have to use the summaries of files if they are not relevant. If they are relevant, feel free to copy them verbatim, or you may choose to extract parts of them to best answer the user. Below is the relevant file structure, selected README excerpts, and summaries of important files. Using this information, please provide precise answers to the following question, referencing specific files or sections when useful. You may also provide code snippets and refer to functions in the files. You are responding directly to the user. Only address the user (whom you should call '{self.repo} Explorer') in your response.

README.md portion: '{readme_response}'
Comma-separated file structure: '{file_response}'
@@ -172,5 +153,5 @@ def run_chat(self, user_input):
response_chunk = chunk.choices[0].delta.content
yield response_chunk
response += response_chunk
self.update_history("assistant", response)

self.update_history("assistant", response)
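
With the constructor now taking the key explicitly instead of reading it from the environment, wiring the pieces together looks roughly like this (the URL, token, and key are placeholders):

from github_scraper import GitHubScraper
from chat_rover import ChatRover

scraper = GitHubScraper("https://github.com/owner/repo", token="ghp_...")
rover = ChatRover(scraper, "sk-...")

# run_chat streams the answer in chunks
for chunk in rover.run_chat("What does this repository do?"):
    print(chunk, end="")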
37 changes: 17 additions & 20 deletions github_scraper.py
@@ -1,28 +1,31 @@
import requests

IGNORE_EXTS = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp', '.svg', '.mp4', '.mp3',
'.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip', '.tar', '.gz', '.rar',
'.7z', '.exe', '.dll', '.jar', '.war', '.class']
IGNORE_EXTS = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp', 'svg', 'mp4', 'mp3',
'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx', 'zip', 'tar', 'gz', 'rar',
'7z', 'exe', 'dll', 'jar', 'war', 'class']


class GitHubScraper:

def __init__(self, github_url, branch=None, condensed=False):
def __init__(self, github_url, token=None, branch=None, condensed=False):
self.github_url = github_url
self.token = token
self.header = {'Authorization': f'token {self.token}'} if self.token else {}
self.owner, self.repo = self.get_github_repo_info()
self.branch = self.get_default_branch() if branch is None else branch

self.file_contents = {}
self.root_readme = ""
self.file_paths = []
self.set_files(condensed)

# Getters
def get_repo_name(self):
return self.repo

def get_file_paths(self):
return self.file_paths

def get_readme(self):
return self.root_readme

@@ -34,7 +37,7 @@ def get_github_repo_info(self):

def get_default_branch(self):
url = f"https://api.github.com/repos/{self.owner}/{self.repo}"
response = requests.get(url)
response = requests.get(url, headers=self.header)
if response.status_code == 200:
data = response.json()
return data.get('default_branch', 'master')
@@ -44,7 +47,7 @@ def get_default_branch(self):

def set_files(self, condensed=False):
url = f"https://api.github.com/repos/{self.owner}/{self.repo}/git/trees/{self.branch}?recursive=1"
response = requests.get(url)
response = requests.get(url, headers=self.header)

if response.status_code == 200:
data = response.json()
@@ -56,18 +59,21 @@ def set_files(self, condensed=False):
if file_extension not in IGNORE_EXTS:
if file_name == 'readme.md':
# must use the path's original casing to fetch the file
self.root_readme = self.get_file_raw(file['path'])
self.root_readme = self.get_file_raw(file['path'])
files.append(file['path'])
if condensed:
files = self._condense_file_structure(files)
self.file_paths = files
else:
print("Error:", response.status_code, response.text)


def get_file_raw(self, file_path):
if file_path in self.file_contents:
return self.file_contents[file_path]

url = f'https://api.github.com/repos/{self.owner}/{self.repo}/contents/{file_path}?ref={self.branch}'
headers = {'Accept': 'application/vnd.github.v3.raw'}
headers = self.header.copy()
headers['Accept'] = 'application/vnd.github.v3.raw'

response = requests.get(url, headers=headers)

@@ -98,12 +104,3 @@ def _condense_file_structure(self, file_paths):
formatted_structure += " " * current_depth + path_segments[-1] + "\n"

return formatted_structure


if __name__ == "__main__":
# Replace with your GitHub URL
github_url = 'https://github.com/Stability-AI/generative-models'

scraper = GitHubScraper(github_url)
print(scraper.root_readme)
print(scraper.file_paths)
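
With the inline __main__ demo removed, the same smoke test can be run from a separate script; a sketch using the repo URL the old demo pointed at (pass a token to raise GitHub's rate limit):

from github_scraper import GitHubScraper

# Unauthenticated requests count against GitHub's low anonymous rate limit.
scraper = GitHubScraper('https://github.com/Stability-AI/generative-models')
print(scraper.get_repo_name())
print(scraper.get_readme())
print(scraper.get_file_paths())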
10 changes: 8 additions & 2 deletions requirements.txt
@@ -1,6 +1,12 @@
streamlit
altair==4
altair
openai
numpy
tiktoken
langchain
langchain==0.0.353
langchain_openai
langchain_community==0.0.9
langchain_core
langsmith==0.0.77
faiss-cpu
python-dotenv