Skip to content
Open
14 changes: 12 additions & 2 deletions sdk/harambe/contrib/soup/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@
# noinspection PyProtectedMember
from curl_cffi.requests import AsyncSession, HeaderTypes
from harambe.contrib.soup.tracing import Tracer
from harambe.contrib.types import AbstractElementHandle, AbstractPage, Selectable
from harambe.contrib.types import ResponseWithStatus
from harambe.contrib.types import (
AbstractElementHandle,
AbstractPage,
ResponseWithStatus,
Selectable,
)


class SoupElementHandle(AbstractElementHandle, Selectable["SoupElementHandle"]):
Expand Down Expand Up @@ -139,3 +143,9 @@ def __init__(self, selector: str, page: SoupPage) -> None:

async def all(self) -> list[SoupElementHandle]:
    """Return every element on the page matching this locator's selector."""
    return await self._page.query_selector_all(self._selector)

async def text_content(self) -> str | None:
    """Return the text content of the first element matching this locator's
    selector, or ``None`` when no element matches.
    """
    if el := await self._page.query_selector(self._selector):
        return await el.text_content()

    return None
84 changes: 80 additions & 4 deletions sdk/harambe/core.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio
import base64
import inspect
import tempfile
import uuid
Expand All @@ -22,10 +23,8 @@
from harambe.contrib.soup.impl import SoupPage
from harambe.contrib.types import AbstractPage
from harambe.cookie_utils import fix_cookie
from harambe.handlers import (
ResourceRequestHandler,
ResourceType,
)
from harambe.handlers import ResourceRequestHandler, ResourceType
from harambe.llm import LLM_AGENTS, LLMManager
from harambe.observer import (
DownloadMeta,
HTMLMetadata,
Expand All @@ -52,6 +51,7 @@
from harambe_core.errors import GotoError
from harambe_core.normalize_url import normalize_url
from harambe_core.parser.expression import ExpressionEvaluator
from harambe_core.types import SchemaFieldType
from playwright.async_api import (
ElementHandle,
Page,
Expand Down Expand Up @@ -608,5 +608,81 @@ async def wrapper(sdk: "SDK", url: URL, context: Context) -> None:

return decorator

@staticmethod
async def llm(
    to_evaluate: Optional[ElementHandle | str] = None,
    is_image_url: bool = False,
    prompt: str = "You are a superintelligent artificial intelligence designed for helping software developers. Help evaluate the given text.",
    data_type: SchemaFieldType = "string",
    include_screenshot: bool = False,
    agent: Optional[LLM_AGENTS] = None,
    model: Optional[str] = None,
    return_object_format: Optional[object] = None,
) -> Optional[str]:
    """
    Use a LLM agent to evaluate any prompt for a string or ElementHandle or image URL.

    Parameters:
        to_evaluate (Optional[ElementHandle | str]): The ElementHandle or string or image URL to evaluate.
        is_image_url (bool): Whether the to_evaluate is an image or not.
        prompt (str): The prompt to use for the evaluation.
        data_type (SchemaFieldType): The type of data to return.
        include_screenshot (bool): Whether to include the screenshot of the element in the response (Playwright only)
        agent (Optional[LLM_AGENTS]): The LLM agent to use.
        model (Optional[str]): The model to use.
        return_object_format (Optional[object]): The format to return the data in.

    Returns:
        The LLM response string, or None when the model reports that no
        information was found.
    """
    # Normalize the format argument; a mutable default ({}) would be shared
    # across calls, so None is the real default and is treated as "no format".
    object_format = return_object_format or {}

    # Use a distinct name so the `agent` parameter is not shadowed.
    llm_manager = LLMManager(agent=agent, model=model)

    stringify = to_evaluate
    if not is_image_url:
        if isinstance(to_evaluate, str):
            stringify = to_evaluate.strip()

        elif hasattr(to_evaluate, "text_content") and callable(
            to_evaluate.text_content
        ):
            # text_content() may return None for an empty/missing element;
            # guard before stripping to avoid an AttributeError.
            text = await to_evaluate.text_content()
            stringify = text.strip() if text is not None else None

    # Instruct the model to answer with a sentinel when nothing is found,
    # so "no information" is distinguishable from a real answer.
    prompt += '. Just return the requested data without any additional text. If no information is found, return "NONE"'

    # Return the response in the requested format
    if object_format:
        prompt += f". Return response in the following format: {{{str(object_format)}}}"

    prompts = [
        {"type": "text", "content": prompt},
        {"type": "image" if is_image_url else "text", "content": stringify},
    ]

    # Optionally attach a screenshot of the element (Playwright only).
    if include_screenshot and not is_image_url:
        screenshot = await to_evaluate.screenshot()
        screenshot_b64 = base64.b64encode(screenshot).decode()

        prompts.append(
            {
                "type": "image",
                "content": f"data:image/jpeg;base64,{screenshot_b64}",
            },
        )

    response = llm_manager.query(prompts)

    # The sentinel means the model found no information.
    if response == "NONE":
        return None

    if not object_format:
        # Validate the bare response against the requested data_type.
        schema = {"data": {"type": data_type}}
        validator = SchemaParser(schema)
        validator.validate({"data": response}, base_url="https://example.com")

    return response


PAGE_PDF_FILENAME = "reworkd_page_pdf.pdf"
49 changes: 49 additions & 0 deletions sdk/harambe/llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from typing import Dict, List, Literal, Optional

from openai import OpenAI

from harambe.settings import get_settings

LLM_AGENTS = Literal["openai"]


class LLMManager:
    """Thin wrapper around the supported LLM providers (currently OpenAI only)."""

    def __init__(self, agent: Optional[LLM_AGENTS] = None, model: Optional[str] = None):
        """
        Parameters:
            agent: The LLM provider to use; defaults to "openai".
            model: The model name to use; defaults to "gpt-4o-mini".

        Raises:
            ValueError: If an unsupported agent name is supplied. Failing here
                is clearer than the UnboundLocalError `query` would otherwise
                raise when `self.agent` was never assigned.
        """
        self.agent_name = "openai" if agent is None else agent
        self.model = "gpt-4o-mini" if model is None else model

        if self.agent_name == "openai":
            self.agent = OpenAI(api_key=get_settings().openai_api_key)
        else:
            raise ValueError(f"Unsupported LLM agent: {self.agent_name}")

    def query(self, prompts: List[Dict[str, str]]) -> str:
        """
        Query the LLM agent with the given prompts.

        Parameters:
            prompts: List[{ type, content }]
                type: one of ["text", "image"]
                content: "string"

        Returns:
            response: The response from the LLM agent.
        """
        # Translate the provider-agnostic prompt list into OpenAI's
        # chat-completions content format.
        content: List[Dict[str, object]] = []
        for prompt in prompts:
            if prompt["type"] == "text":
                content.append({"type": "text", "text": prompt["content"]})
            elif prompt["type"] == "image":
                content.append(
                    {
                        "type": "image_url",
                        "image_url": {"url": f"{prompt['content']}"},
                    }
                )

        response = self.agent.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": content}],
        )

        return response.choices[0].message.content
14 changes: 14 additions & 0 deletions sdk/harambe/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from functools import lru_cache

# from pydantic import SecretStr
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: remove

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I was planning to remove this.
Actually, it's recommended to use SecretStr, but it masks the value even when we actually need it, so I was still trying to figure that out.

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Runtime configuration, sourced from the environment or a local .env file."""

    model_config = SettingsConfigDict(env_file=".env")

    # presumably populated from the OPENAI_API_KEY environment variable — confirm
    openai_api_key: str


@lru_cache
def get_settings() -> "Settings":
    """Return the process-wide Settings instance, constructed lazily on first
    use and cached for every subsequent call."""
    return Settings()
Comment on lines +12 to +14
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this from their examples?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, I got this from a blog.

3 changes: 3 additions & 0 deletions sdk/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ dependencies = [
"curl-cffi==0.7.3",
"ua-generator==1.0.5",
"python-slugify>=8.0.4",
"pydantic-settings>=2.6.1",
"openai>=1.56.2",
]

[tool.uv]
Expand All @@ -32,6 +34,7 @@ dev-dependencies = [
"pytest==7.4.4",
"pytest-cov==4.1.0",
"pytest-asyncio==0.21.2",
"rich>=13.9.4",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's this for?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used it locally for pretty-printing on the console; I can remove it — it's just a dev dependency.

]

[tool.uv.sources]
Expand Down
79 changes: 76 additions & 3 deletions sdk/test/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@
import pytest
from aiohttp import web
from bs4 import BeautifulSoup
from harambe import SDK
from harambe.contrib import playwright_harness, soup_harness
from harambe.observer import InMemoryObserver
from harambe.types import BrowserType
from harambe_core.errors import GotoError

from harambe import SDK
from harambe.contrib import playwright_harness, soup_harness


@pytest.fixture(scope="module")
def mock_html_folder():
Expand Down Expand Up @@ -588,3 +587,77 @@ async def scrape(sdk: SDK, current_url, context) -> None:
)

assert len(observer.data) == 0


@pytest.mark.parametrize("harness", [playwright_harness, soup_harness])
async def test_core_llm_method(server, observer, harness):
    """Exercise SDK.llm end to end on both harnesses.

    Covers three paths: evaluating a page element, evaluating a large raw
    string, and the no-information case, which must yield None.
    """
    url = f"{server}/solicitation"

    async def scrape(sdk: SDK, url, context) -> None:
        page = sdk.page
        body = page.locator("body")

        date_prepared = await sdk.llm(
            to_evaluate=body,
            prompt="Find the date prepared in the general information",
            data_type="datetime",
        )
        solicitation_id = await sdk.llm(
            to_evaluate=body,
            prompt="Find the solicitation id in the general information",
            data_type="int",
        )

        large_text = """
        <p> Here is a big lorem ipsum element Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc imperdiet,
        libero ac vestibulum tristique, massa orci viverra augue, eu congue elit urna a justo. Suspendisse ac nisi
        dolor. Sed turpis ante, tincidunt in nibh quis, pellentesque euismod dolor. Sed at mauris maximus, tempus
        dolor eu, tristique sem. Fusce in dolor egestas, The product launch date was 12/04/2024 vulputate lacus eget, pharetra felis. Morbi ac lorem at
        lorem aliquam blandit. Quisque eget vulputate felis. Suspendisse bibendum mauris vel ex dignissim tincidunt.
        Integer porttitor libero ligula, ut convallis lorem rhoncus et. Ut ac nisl a mauris malesuada aliquet.
        In vitae pharetra tellus. Suspendisse vel varius tellus. Ut pellentesque sem at gravida volutpat. </p>
        """
        product_launch_date = await sdk.llm(
            to_evaluate=large_text,
            prompt="Find the product launch date on page",
            data_type="datetime",
        )

        # Same text but with the date removed: the LLM must report NONE.
        large_text_none = """
        <p> Here is a big lorem ipsum element Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc imperdiet,
        libero ac vestibulum tristique, massa orci viverra augue, eu congue elit urna a justo. Suspendisse ac nisi
        dolor. Sed turpis ante, tincidunt in nibh quis, pellentesque euismod dolor. Sed at mauris maximus, tempus
        dolor eu, tristique sem. Fusce in dolor egestas, The product launch date was vulputate lacus eget, pharetra felis. Morbi ac lorem at
        lorem aliquam blandit. Quisque eget vulputate felis. Suspendisse bibendum mauris vel ex dignissim tincidunt.
        Integer porttitor libero ligula, ut convallis lorem rhoncus et. Ut ac nisl a mauris malesuada aliquet.
        In vitae pharetra tellus. Suspendisse vel varius tellus. Ut pellentesque sem at gravida volutpat. </p>
        """
        product_launch_date_none = await sdk.llm(
            to_evaluate=large_text_none,
            prompt="Find the product launch date on page",
            data_type="datetime",
        )

        await sdk.save_data(
            {
                "date_prepared": date_prepared,
                "solicitation_id": solicitation_id,
                "product_launch_date": product_launch_date,
                "product_launch_date_none": product_launch_date_none,
            }
        )

    await SDK.run(
        scrape,
        url,
        schema={},
        harness=harness,
        context={"status": "Open"},
        observer=observer,
    )
    assert len(observer.data) == 1
    assert observer.data[0]["date_prepared"] == "10/31/24"
    assert observer.data[0]["solicitation_id"] == "6100062375"
    assert observer.data[0]["product_launch_date"] == "12/04/2024"
    assert observer.data[0]["product_launch_date_none"] is None
Loading