Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@ regex = "1.10.4"
ollama-rs = { version = "0.1.9", features = ["stream"] }
owo-colors = "4.0.0"
lazy_static = "1.4.0"


sqlx = { version = "0.6.0", features = ["runtime-tokio-rustls", "macros", "postgres"] }
bcrypt = "0.11.0"
39 changes: 38 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ This project aims to build a tool that can be run locally, is open-source, and d
- [x] very quick searching, scraping & answering due to parallelism
- [x] Configurable number of search results to parse
- [x] local scraping of websites
- [x] User account management with login and registration functionality
- [x] Search history saving for logged-in users

---

Expand All @@ -27,8 +29,20 @@ This project aims to build a tool that can be run locally, is open-source, and d
2. Get Bing API key
3. Get OpenAI API key or [Ollama](https://ollama.com/)
4. Fill/set up the environment variables (see the `sample.env` file: copy it to `.fyin.env` and fill in the values)
5. `cargo run --query "<Question>" -n <number of search results>`
5. Set up the database (see instructions below)
6. `cargo run --query "<Question>" -n <number of search results>`

### Setting up the Database

1. Install PostgreSQL and create a new database.
2. Set the `DATABASE_URL` environment variable to point to your database. For example:
```
export DATABASE_URL=postgres://user:password@localhost/fyin
```
3. Run the database migrations:
```
sqlx migrate run
```

### Environment Variables
```
Expand All @@ -50,6 +64,9 @@ EMBEDDING_MODEL_NAME="text-embedding-ada-002"

# CHAT_MODEL_NAME="llama3"
CHAT_MODEL_NAME="gpt-4o"

# Database URL
DATABASE_URL="your-database-url"
```

### Docker
Expand All @@ -60,6 +77,26 @@ Here is how you can run the app using docker:

`docker run --rm --env-file .env fyin --query "<your question>" --search <optional: number of search results to parse>`

## Using Accounts and Saving Searches

### Registering a New Account
To register a new account, pass the `--register` flag with your username; you will be prompted for a password interactively:
```
cargo run -- --register <your-username>
```

### Logging In
To log in to your account, pass the `--login` flag with your username; you will be prompted for a password interactively:
```
cargo run -- --login <your-username> --query "<Question>"
```

### Saving Searches
When you are logged in (via `--login`), each query you run is automatically saved to the `search_history` table in the database, so you can review your past searches there.

## Notes
- The app uses the Bing API for searching. You can get a key from [Active Bing API](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api).
- You can get an OpenAI API key from [OpenAI](https://openai.com/api/).
Expand Down
8 changes: 8 additions & 0 deletions src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,12 @@ pub struct Args {
/// Number of search results to parse
#[arg(short, long, default_value_t = 10)]
pub search: usize,

/// Register a new user
#[arg(long)]
pub register: Option<String>,

/// Login as an existing user
#[arg(long)]
pub login: Option<String>,
}
13 changes: 13 additions & 0 deletions src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,19 @@ pub struct Chunk {
pub url: String,
}

/// A registered application user, as stored in the `users` table.
pub struct User {
    // Primary key assigned by the database.
    pub id: i32,
    // Unique login name chosen at registration.
    pub username: String,
    // bcrypt hash of the user's password — never the plaintext.
    pub password_hash: String,
}

/// One saved search, as stored in the `search_history` table.
pub struct SearchHistory {
    // Primary key assigned by the database.
    pub id: i32,
    // Foreign key to `users.id` — the user who ran the search.
    pub user_id: i32,
    // The query text the user searched for.
    pub query: String,
    // When the search was saved (UTC, naive — no timezone stored).
    pub timestamp: chrono::NaiveDateTime,
}

pub fn hash_string(input: &str) -> String {
let mut hasher = Sha256::new();
hasher.update(input.as_bytes());
Expand Down
62 changes: 62 additions & 0 deletions src/db.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
use anyhow::{Context, Result};
use bcrypt::{hash, verify, DEFAULT_COST};
use chrono::Utc;
use sqlx::{postgres::PgPoolOptions, Pool, Postgres};

use crate::data::{SearchHistory, User};

pub async fn establish_connection() -> Result<Pool<Postgres>> {
let database_url = std::env::var("DATABASE_URL")?;
let pool = PgPoolOptions::new()
.max_connections(5)
.connect(&database_url)
.await?;
Ok(pool)
}

pub async fn register_user(username: &str, password: &str) -> Result<()> {
let pool = establish_connection().await?;
let password_hash = hash(password, 4)?;
sqlx::query!(
"INSERT INTO users (username, password_hash) VALUES ($1, $2)",
username,
password_hash
)
.execute(&pool)
.await?;
Ok(())
}

/// Verifies a username/password pair against the stored bcrypt hash.
///
/// Returns `Ok(true)` on a correct password, `Ok(false)` for either an
/// unknown username or a wrong password — the two cases are deliberately
/// indistinguishable to the caller, which also avoids leaking which
/// usernames exist.
///
/// # Errors
/// Returns an error only for infrastructure failures (database
/// unreachable, malformed stored hash).
pub async fn login_user(username: &str, password: &str) -> Result<bool> {
    let pool = establish_connection().await?;
    // `fetch_optional` instead of `fetch_one`: previously an unknown
    // username surfaced as Err(RowNotFound), so the caller's
    // "Invalid username or password" branch never ran for unknown users.
    let user = sqlx::query_as!(
        User,
        "SELECT id, username, password_hash FROM users WHERE username = $1",
        username
    )
    .fetch_optional(&pool)
    .await?;

    match user {
        Some(user) => Ok(verify(password, &user.password_hash)?),
        None => Ok(false),
    }
}

pub async fn save_search_history(username: &str, query: &str) -> Result<()> {
let pool = establish_connection().await?;
let user = sqlx::query_as!(
User,
"SELECT id, username, password_hash FROM users WHERE username = $1",
username
)
.fetch_one(&pool)
.await?;

sqlx::query!(
"INSERT INTO search_history (user_id, query, timestamp) VALUES ($1, $2, $3)",
user.id,
query,
Utc::now().naive_utc()
)
.execute(&pool)
.await?;
Ok(())
}
46 changes: 26 additions & 20 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
// #![allow(unused_variables)]
#![allow(dead_code)]
// #![allow(unused_imports)]
// #![allow(deprecated)]

#[macro_use]
extern crate lazy_static;
Expand All @@ -14,6 +11,7 @@ mod llm;
mod pretty_print;
mod scraper;
mod vector;
mod db;

use anyhow::Result;
use clap::Parser;
Expand All @@ -25,15 +23,14 @@ use dotenv;
use std::env;

async fn init() -> Result<()> {
// load ENV variables
dotenv::dotenv().ok();

// verify required ones are present
let env_vars = [
"OPENAI_API_KEY",
"BING_SUBSCRIPTION_KEY",
"EMBEDDING_MODEL_NAME",
"CHAT_MODEL_NAME",
"DATABASE_URL",
];

for &var_name in &env_vars {
Expand All @@ -56,58 +53,67 @@ async fn main() -> Result<()> {
init().await?;
let args = args::Args::parse();

prompt(&args.query, args.search).await?;
if let Some(username) = args.register {
let password = rpassword::prompt_password("Password: ").unwrap();
db::register_user(&username, &password).await?;
println!("User registered successfully.");
return Ok(());
}

if let Some(username) = args.login {
let password = rpassword::prompt_password("Password: ").unwrap();
if db::login_user(&username, &password).await? {
println!("Login successful.");
prompt(&args.query, args.search, Some(username)).await?;
} else {
println!("Invalid username or password.");
}
return Ok(());
}

prompt(&args.query, args.search, None).await?;
Ok(())
}

async fn prompt(prompt: &str, search_count: usize) -> Result<()> {
async fn prompt(prompt: &str, search_count: usize, username: Option<String>) -> Result<()> {
pretty_print::print_blue(&format!("Searching for: {}", prompt));
let request = data::Request::init(prompt);
let llm_agent = llm::LlmAgent::init().await;

// do a test embed and figure out dimension

let dimension = llm_agent.embed_string(prompt).await.unwrap().len();

// create a new vector client
let vector_client = Arc::new(sync::Mutex::new(
vector::VectorDB::init(Some(dimension)).await?,
));

// fetch search results
pretty_print::print_blue("Fetching search results from bing...");
bing::fetch_web_pages(request.clone(), search_count).await?;

// scrape content
pretty_print::print_blue("Scraping content from search results...");
scraper::process_urls(request.clone()).await?;

// do embedding on all the scrapped contents.
// store in vector DB
pretty_print::print_blue("Embedding content...");
embedding::generate_upsert_embeddings(request.clone(), vector_client.clone()).await?;

// convert prompt to embedding
let prompt_embedding = llm_agent.embed_string(prompt).await?;

// build vector index
vector_client.lock().await.build_index().await?;

// search across embedding
// and get all embedding ids
let ids = vector_client
.lock()
.await
.search(&prompt_embedding, 10)
.await?;

// get content
let chunks: Vec<data::Chunk> = request.lock().unwrap().get_chunks(ids);

let llm_agent = llm::LlmAgent::init().await;
llm_agent.answer_question_stream(prompt, &chunks).await?;

//clean-up vector DB
if let Some(username) = username {
db::save_search_history(&username, prompt).await?;
}

vector_client.lock().await.clean_up().await?;

Ok(())
Expand Down