Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
20 changes: 15 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,18 @@ bin/
# Documentation
.docs/

# ADK
.adk/
go/agents/sail-researcher/researcher
go/agents/sail-researcher/vendor/*
go/agents/sail-researcher/server
# Audit Artifacts
evidence_lake/
evidence_lake.json
cockpit_report.html
cockpit_audit.sarif
fleet_dashboard.html
optimization_report_*.html

# Recursive folders
python/agents/agent-optimizer/
python/agents/agent-optimizer/adk-samples/
python/agents/agent-optimizer/adk-python/
python/agents/agent-optimizer/cloned_repo/


9 changes: 3 additions & 6 deletions python/agents/RAG/rag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,9 @@
# limitations under the License.

import os

import google.auth

from . import agent

# Resolve Application Default Credentials once at import time so the package
# can seed the environment variables below.
_, project_id = google.auth.default()
# google.auth.default() may return None for the project ID, and os.environ only
# accepts strings; skip the default instead of raising TypeError on import.
if project_id:
    os.environ.setdefault("GOOGLE_CLOUD_PROJECT", project_id)
# Plain assignment: this overrides any location that was already set.
os.environ["GOOGLE_CLOUD_LOCATION"] = "global"
os.environ.setdefault("GOOGLE_GENAI_USE_VERTEXAI", "True")
37 changes: 3 additions & 34 deletions python/agents/RAG/rag/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,46 +14,15 @@

import os
import uuid

from dotenv import load_dotenv
from google.adk.agents import Agent
from google.adk.tools.retrieval.vertex_ai_rag_retrieval import (
VertexAiRagRetrieval,
)
from google.adk.tools.retrieval.vertex_ai_rag_retrieval import VertexAiRagRetrieval
from openinference.instrumentation import using_session
from vertexai.preview import rag

from rag.tracing import instrument_adk_with_arize

from .prompts import return_instructions_root

# Load settings from a local .env file and run the tracing setup helper before
# any tool or agent object is constructed.
load_dotenv()
_ = instrument_adk_with_arize()

# Retrieval tool bound to the corpus named by the RAG_CORPUS environment
# variable, e.g. projects/123/locations/us-central1/ragCorpora/456.
ask_vertex_retrieval = VertexAiRagRetrieval(
    name="retrieve_rag_documentation",
    description="Use this tool to retrieve documentation and reference materials for the question from the RAG corpus,",
    rag_resources=[rag.RagResource(rag_corpus=os.environ.get("RAG_CORPUS"))],
    similarity_top_k=10,
    vector_distance_threshold=0.6,
)

# The root agent is created inside a session context keyed by a fresh UUID.
with using_session(session_id=uuid.uuid4()):
    root_agent = Agent(
        model="gemini-2.0-flash-001",
        name="ask_rag_agent",
        instruction=return_instructions_root(),
        tools=[ask_vertex_retrieval],
    )
94 changes: 5 additions & 89 deletions python/agents/RAG/rag/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,94 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""Module for storing and retrieving agent instructions.

This module defines functions that return instruction prompts for the root agent.
These instructions guide the agent's behavior, workflow, and tool usage.
"""

from google.adk.agents.context_cache_config import ContextCacheConfig
"Module for storing and retrieving agent instructions.\n\nThis module defines functions that return instruction prompts for the root agent.\nThese instructions guide the agent's behavior, workflow, and tool usage.\n"

def return_instructions_root() -> str:
    """Return the instruction prompt used by the root RAG agent.

    The prompt tells the model when to call the retrieval tool, to ask
    clarifying questions when the user's intent is unclear, and how to format
    citations at the end of an answer.

    Returns:
        The version-1 instruction prompt as a single string.
    """
    # Only the v1 prompt is returned; the superseded v0 prompt was an unused
    # local that was rebuilt on every call, so it is no longer defined here.
    instruction_prompt_v1 = """
        You are an AI assistant with access to specialized corpus of documents.
        Your role is to provide accurate and concise answers to questions based
        on documents that are retrievable using ask_vertex_retrieval. If you believe
        the user is just chatting and having casual conversation, don't use the retrieval tool.

        But if the user is asking a specific question about a knowledge they expect you to have,
        you can use the retrieval tool to fetch the most relevant information.

        If you are not certain about the user intent, make sure to ask clarifying questions
        before answering. Once you have the information you need, you can use the retrieval tool
        If you cannot provide an answer, clearly explain why.

        Do not answer questions that are not related to the corpus.
        When crafting your answer, you may use the retrieval tool to fetch details
        from the corpus. Make sure to cite the source of the information.

        Citation Format Instructions:

        When you provide an answer, you must also add one or more citations **at the end** of
        your answer. If your answer is derived from only one retrieved chunk,
        include exactly one citation. If your answer uses multiple chunks
        from different files, provide multiple citations. If two or more
        chunks came from the same file, cite that file only once.

        **How to cite:**
        - Use the retrieved chunk's `title` to reconstruct the reference.
        - Include the document title and section if available.
        - For web resources, include the full URL when available.

        Format the citations at the end of your answer under a heading like
        "Citations" or "References." For example:
        "Citations:
        1) RAG Guide: Implementation Best Practices
        2) Advanced Retrieval Techniques: Vector Search Methods"

        Do not reveal your internal chain-of-thought or how you used the chunks.
        Simply provide concise and factual answers, and then list the
        relevant citation(s) at the end. If you are not certain or the
        information is not available, clearly state that you do not have
        enough information.
        """

    return instruction_prompt_v1
124 changes: 34 additions & 90 deletions python/agents/RAG/rag/shared_libraries/prepare_corpus_and_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,155 +12,99 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from tenacity import retry, wait_exponential, stop_after_attempt
from tenacity import retry, wait_exponential, stop_after_attempt
import os
import tempfile

import requests
import vertexai
from dotenv import load_dotenv, set_key
from google.api_core.exceptions import ResourceExhausted
from google.auth import default
from vertexai.preview import rag

# Load environment variables from .env file
load_dotenv()

# --- Please fill in your configurations ---
# Retrieve the PROJECT_ID from the environmental variables.
# Both settings are mandatory; fail at import time with a pointer to .env.
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
if not PROJECT_ID:
    raise ValueError(
        "GOOGLE_CLOUD_PROJECT environment variable not set. "
        "Please set it in your .env file."
    )

LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION")
if not LOCATION:
    raise ValueError(
        "GOOGLE_CLOUD_LOCATION environment variable not set. "
        "Please set it in your .env file."
    )

# Corpus metadata and the document that is uploaded into it.
CORPUS_DISPLAY_NAME = "Alphabet_10K_2024_corpus"
CORPUS_DESCRIPTION = "Corpus containing Alphabet's 10-K 2024 document"
PDF_URL = "https://abc.xyz/assets/77/51/9841ad5c4fbe85b4440c47a4df8d/goog-10-k-2024.pdf"
PDF_FILENAME = "goog-10-k-2024.pdf"

# Absolute path of the .env file two directories above this script.
ENV_FILE_PATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, ".env")
)


# --- Start of the script ---
def initialize_vertex_ai():
    """Initialise the Vertex AI SDK for PROJECT_ID and LOCATION.

    Credentials are taken from the first element returned by ``default()``.
    """
    adc_credentials = default()[0]
    vertexai.init(
        credentials=adc_credentials,
        project=PROJECT_ID,
        location=LOCATION,
    )

def create_or_get_corpus():
    """Return the corpus named CORPUS_DISPLAY_NAME, creating it when absent.

    Returns:
        The first listed corpus whose display name matches
        CORPUS_DISPLAY_NAME, otherwise a newly created corpus.
    """
    embedding_model_config = rag.EmbeddingModelConfig(
        publisher_model="publishers/google/models/text-embedding-004"
    )

    # next() stops at the first match, mirroring a loop that breaks on a hit.
    matching = next(
        (
            candidate
            for candidate in rag.list_corpora()
            if candidate.display_name == CORPUS_DISPLAY_NAME
        ),
        None,
    )
    if matching is not None:
        print(f"Found existing corpus with display name '{CORPUS_DISPLAY_NAME}'")
        return matching

    created = rag.create_corpus(
        display_name=CORPUS_DISPLAY_NAME,
        description=CORPUS_DESCRIPTION,
        embedding_model_config=embedding_model_config,
    )
    print(f"Created new corpus with display name '{CORPUS_DISPLAY_NAME}'")
    return created


def download_pdf_from_url(url, output_path, timeout=60):
    """Download the file at ``url`` and write it to ``output_path``.

    Args:
        url: Address of the PDF to fetch.
        output_path: Destination path; the file is opened in ``"wb"`` mode.
        timeout: Seconds to wait for the server to connect or to send data
            before giving up. Without it, a stalled server would block the
            script indefinitely.

    Returns:
        ``output_path``, unchanged.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    print(f"Downloading PDF from {url}...")
    # The context manager releases the streamed connection even when
    # raise_for_status() or a write fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()  # Raise an exception for HTTP errors
        with open(output_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    print(f"PDF downloaded successfully to {output_path}")
    return output_path


def upload_pdf_to_corpus(corpus_name, pdf_path, display_name, description):
    """Upload a local PDF into a RAG corpus.

    Args:
        corpus_name: Name of the target corpus.
        pdf_path: Path of the PDF on disk.
        display_name: Display name given to the uploaded file.
        description: Description stored with the uploaded file.

    Returns:
        The object returned by ``rag.upload_file`` on success, or ``None``
        when the upload raises (the error is printed, not re-raised).
    """
    print(f"Uploading {display_name} to corpus...")
    upload_kwargs = {
        "corpus_name": corpus_name,
        "path": pdf_path,
        "display_name": display_name,
        "description": description,
    }
    try:
        uploaded = rag.upload_file(**upload_kwargs)
        print(f"Successfully uploaded {display_name} to corpus")
        return uploaded
    except ResourceExhausted as quota_error:
        # Quota errors get extra guidance on top of the raw error text.
        quota_messages = (
            f"Error uploading file {display_name}: {quota_error}",
            "\nThis error suggests that you have exceeded the API quota for the embedding model.",
            "This is common for new Google Cloud projects.",
            "Please see the 'Troubleshooting' section in the README.md for instructions on how to request a quota increase.",
        )
        for message in quota_messages:
            print(message)
        return None
    except Exception as upload_error:
        print(f"Error uploading file {display_name}: {upload_error}")
        return None


def update_env_file(corpus_name, env_file_path):
    """Store ``corpus_name`` under the RAG_CORPUS key of the given .env file.

    Any exception is printed instead of propagated, so a failed update does
    not stop the caller.

    Args:
        corpus_name: Value to write for RAG_CORPUS.
        env_file_path: Path of the .env file to update.
    """
    try:
        set_key(env_file_path, "RAG_CORPUS", corpus_name)
        confirmation = f"Updated RAG_CORPUS in {env_file_path} to {corpus_name}"
        print(confirmation)
    except Exception as write_error:
        print(f"Error updating .env file: {write_error}")

def list_corpus_files(corpus_name):
    """Print the number of files in a corpus, then one line per file.

    Args:
        corpus_name: Name of the corpus whose files are listed.
    """
    # Materialise the listing first so the total can be printed up front.
    corpus_files = [*rag.list_files(corpus_name=corpus_name)]
    print(f"Total files in corpus: {len(corpus_files)}")
    for entry in corpus_files:
        print(f"File: {entry.display_name} - {entry.name}")

def main():
    """Run the end-to-end corpus preparation.

    Initialises Vertex AI, finds or creates the corpus, records its name in
    the .env file, downloads the PDF into a temporary directory, uploads it,
    and finally lists the files in the corpus.
    """
    initialize_vertex_ai()
    corpus = create_or_get_corpus()

    # Record the corpus name in the .env file.
    update_env_file(corpus.name, ENV_FILE_PATH)

    # The download lives in a temporary directory that is removed afterwards.
    with tempfile.TemporaryDirectory() as scratch_dir:
        pdf_path = os.path.join(scratch_dir, PDF_FILENAME)
        download_pdf_from_url(PDF_URL, pdf_path)
        upload_pdf_to_corpus(
            corpus_name=corpus.name,
            pdf_path=pdf_path,
            display_name=PDF_FILENAME,
            description="Alphabet's 10-K 2024 document",
        )

    # Show what the corpus contains after the upload attempt.
    list_corpus_files(corpus_name=corpus.name)


if __name__ == "__main__":
main()
if __name__ == '__main__':
main()
Loading