diff --git a/app/.env.example b/app/.env.example new file mode 100644 index 0000000..8805dc2 --- /dev/null +++ b/app/.env.example @@ -0,0 +1,26 @@ +# Card Storage Configuration +# Copy this file to .env.local and configure your storage backend + +# ============================================================================ +# Storage Provider +# ============================================================================ +# Currently using in-memory storage for development +# Future: Can be extended to support DynamoDB or PostgreSQL + +# ============================================================================ +# Authentication Configuration +# ============================================================================ +# TODO: Configure your authentication provider +# Example with NextAuth.js: +# NEXTAUTH_URL=http://localhost:3000 +# NEXTAUTH_SECRET=your_secret_here + +# ============================================================================ +# Development Settings +# ============================================================================ + +# Enable verbose logging for card operations +DEBUG_CARD_STORAGE=false + +# Maximum cards per user (optional limit) +MAX_CARDS_PER_USER=10000 diff --git a/app/.gitignore b/app/.gitignore new file mode 100644 index 0000000..6923908 --- /dev/null +++ b/app/.gitignore @@ -0,0 +1,48 @@ +# See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
+ +# dependencies +/node_modules +/.pnp +.pnp.js +.yarn/install-state.gz + +# testing +/coverage + +# next.js +/.next/ +/out/ + +# production +/build + +# misc +.DS_Store +*.pem + +# debug +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# local env files +.env*.local +.env + +# vercel +.vercel + +# typescript +*.tsbuildinfo +next-env.d.ts + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db diff --git a/baseCard.json b/baseCard.json new file mode 100644 index 0000000..78bd59b --- /dev/null +++ b/baseCard.json @@ -0,0 +1,93 @@ +{ + "cardId": "string", + "language": "string", + "lemma": "string", + "normalizedLemma": "string", + "partOfSpeech": "string", + "otherForms": [ + { + "form": "string", + "grammaticalInfo": "string" + } + ], + + "phonetics": { + "ipa": "string", + "respelling": "string" + }, + + "definitions": [ + { + "definition": "string", + "register": "neutral | formal | informal | slang | technical", + "domain": "string", + "confidence": 0.0 + } + ], + + "coreMeaning": "string", + + "examples": [ + { + "sentence": "string", + "highlightedLemma": "string", + "difficulty": "easy | medium | hard", + "contextTag": "daily | academic | professional | literary", + "sourceType": "constructed | corpus-inspired" + } + ], + + "collocations": [ + { + "phrase": "string", + "pattern": "string" + } + ], + + "synonyms": [ + { + "word": "string", + "nuance": "string" + } + ], + + "antonyms": ["string"], + + "usageNotes": [ + { + "note": "string", + "commonMistake": true + } + ], + + "semanticRelations": { + "hypernyms": ["string"], + "hyponyms": ["string"], + "relatedConcepts": ["string"] + }, + + "etymology": { + "origin": "string", + "evolution": "string" + }, + + "voiceout": { + "ttsText": "string", + "slowTtsText": "string", + "pronunciationHint": "string" + }, + + "learningAids": { + "mnemonic": "string", + "visualCue": "string" + }, + + "aiGrounding": { + "allowedScope": "Only answer questions using this card's data and general 
linguistic knowledge.", + "ambiguityNotes": "string" + }, + + "version": "1.0", + "createdBy": "ai | human", + "qualityScore": 0.0 +} diff --git a/docs/mvp-v1-tech-stack.md b/docs/mvp-v1-tech-stack.md new file mode 100644 index 0000000..9cde501 --- /dev/null +++ b/docs/mvp-v1-tech-stack.md @@ -0,0 +1,97 @@ +# LingoLM MVP v1 Tech Stack (Recommended) + +## Goal +Implement MVP v1 in the cheapest and simplest way possible: +- Single-word lookup -> auto-populated card -> ask questions about nuance -> add notes -> save +- Minimal infrastructure, minimal operational overhead + +## Non-Goals (v2+) +- Article ingestion and vocabulary extraction +- Full-text search across user notes +- Tags, linking, spaced repetition system (SRS), export +- Embeddings, vector search, “RAG” over a corpus + +## Architecture Summary +- Client: Web app (responsive) +- Auth: Amazon Cognito with Google IdP +- API: Amazon API Gateway (HTTP API) + AWS Lambda +- Data: DynamoDB only (WordCache + UserCards) +- LLM: Amazon Bedrock (structured JSON generation for base cards; chat for nuance Q&A) +- Observability: CloudWatch Logs + basic metrics + +## Frontend +### Choice +- Next.js web app (responsive) +### Hosting +- Vercel (fastest iteration) + +## Backend +- API Gateway (HTTP API) + Lambda +- API Gateway uses a Cognito JWT authorizer for protected routes +### Lambda functions (recommended) +- GET /lookup?lang=&lemma= +- POST /cards (create/update user card) +- GET /cards (list user cards) +- POST /chat (nuance Q&A for a word/card) + +### Core behaviors +- Lookup uses lazy caching: + - WordCache hit: return cached base card JSON + - WordCache miss: call Bedrock -> store base card -> return +- POST /cards stores a user-owned copy (UserCards) that can diverge from the base card +- Chat calls Bedrock with (base card + user edits + notes + question) and returns an answer + +## Data Storage +### DynamoDB tables +1) WordCache (global) +- Partition key: PK = LANG#{lang} +- Sort key: SK = LEMMA#{lemma} +- Attributes: + - 
baseCard (Map) + - generatedAt (ISO string) + - modelId (string) + - promptVersion (string) + - schemaVersion (string) + +2) UserCards (per-user) +- Partition key: PK = USER#{userId} +- Sort key: SK = CARD#{lang}#{lemma}#{cardId} +- Attributes: + - lang (string) + - lemma (string) + - card (Map) # user-editable structured fields + - notes (string) + - createdAt (ISO string) + - updatedAt (ISO string) + - baseRef: + - cachePK (string) + - cacheSK (string) + - schemaVersion (string) + - promptVersion (string) + +### Notes on schema +- Store card bodies as DynamoDB Map types (not stringified JSON) +- Version fields allow safe migrations and gradual regeneration of cached cards + +## Bedrock Usage (No RAG in v1) +### Base card generation +- Bedrock generates a structured “base card” JSON for a lemma using a strict schema and deterministic prompt +- No embeddings, no vector database, no retrieval pipeline + +### Nuance Q&A +- Bedrock answers user questions using: + - base card + - user edits + - user notes + +## Cost/Simplicity Principles +- Avoid always-on databases (no RDS/Postgres for v1) +- Avoid embeddings/vector search until there is a real corpus and a clear retrieval need +- Keep a single backend deployment model (API Gateway + Lambda) +- Use DynamoDB as the only persistent store in v1 + +## v2 Roadmap Hooks +- Article ingestion pipeline: paste article -> extract candidate words -> batch card generation +- Search/tags/linking/SRS: add secondary indexes and/or a dedicated search service later +- True RAG: only after choosing a grounded corpus (user-provided texts or curated examples) and defining retrieval objectives +- Store chat history per card later \ No newline at end of file diff --git a/docs/pull_request_template.md b/docs/pull_request_template.md new file mode 100644 index 0000000..e706ee4 --- /dev/null +++ b/docs/pull_request_template.md @@ -0,0 +1,26 @@ +## Description +Describe the changes you made and why. 
+ +## Related issue(s) +Link the issue(s) that this PR addresses. + +## Type of change +- [ ] Bug fix +- [ ] New feature +- [ ] Breaking change +- [ ] Documentation update + +## How did you test this? +Describe how you tested these changes. + +## Checklist +- [ ] I have performed a self-review of my own code. +- [ ] I have commented my code, particularly in hard-to-understand areas. +- [ ] I have made corresponding changes to the documentation. +- [ ] New and existing unit tests pass locally with my changes. + +## GenAI usage +- What model(s) did you use? +- Percent of AI-written code? +- Any drawbacks? +- (Optional) Paste your main prompts here for feedback on better prompting techniques for efficiency. \ No newline at end of file diff --git a/docs/system-design.png b/docs/system-design.png new file mode 100644 index 0000000..fc3925c Binary files /dev/null and b/docs/system-design.png differ diff --git a/docs/tech-stack.md b/docs/tech-stack.md deleted file mode 100644 index c7ea0c0..0000000 --- a/docs/tech-stack.md +++ /dev/null @@ -1,81 +0,0 @@ -# LingoLM Tech Stack - -Refer to the [LingoLM Master Planning Doc](master-doc.md) for context on the project goals and features, some of the information is reiterated below for ease of technology selection. - -## Definitions -- Note/Card: Document for learning materials, current vocabulary, and other materials. The main data model for this feature is defined in the [Auto-Populated Cards](#auto-populated-cards). Users can edit the card as desired. - -## Core Use Cases -Single word lookup: Search word → pre-populated card appears → ask LLM questions → add personal notes → save (2 minutes vs 15 minutes manual) - -## Key Features - -### Auto-Populated Cards -This is the data model for a vocabulary card that users engage with for learning. 
Users will receive a card with a predefined template to be saved to their documents and edit them like Google Docs, and a DynamoDB table with user info stores the documents in a JSON representation below. - -interface document { - documentId: string (primary key) - word of interest: vocabulary - template_type: string - body: string (stringified JSON) -} - -Since this schema needs to be flexible, DynamoDB is a good option to store this data in a NoSQL fashion with quick retrieval. The cards will have pre-defined templates and will be "hydrated" at runtime with [on-demand RAG](#llm-rag). - -### LLM RAG -An initial setup will populate 5K most common vocabulary words as part of a global cache, which is done by RAG. This is stored in a PostgreSQL + pgvector vector database. When the user requests a word that isn't in the 5K word global cache, on-demand RAG will be performed on that word (different function). After the data is retrieved, a templating function is called to create the template for structuring the note. Prototype this starting with one foreign language. Run this multiple times to see how stochastic the method is, then fine tuned the Bedrock embeddings. The schemas for the vocabulary and contexts are below: - -CREATE TABLE words ( - word_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - lemma TEXT NOT NULL, -- e.g., 'venir' - lang TEXT NOT NULL, -- ISO 639-1 ('es','fr',...) - definition TEXT, - pos TEXT, -- part of speech - freq_rank INTEGER, -- lower = more common - surface_forms TEXT[] DEFAULT '{}', -- ['venir','vine','vengo',...] 
- CONSTRAINT words_lang_lemma_uniq UNIQUE (lang, lemma), - CONSTRAINT words_lang_chk CHECK (lang ~ '^[a-z]{2}$') -); - -CREATE TABLE contexts ( - context_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - word_id UUID NOT NULL REFERENCES words(word_id) ON DELETE CASCADE, - text TEXT, -- short context sentence - used_form TEXT, -- surface form in this context (e.g., 'vine') - cefr cefr_level, - topic TEXT[] DEFAULT '{}', -- ['travel','past-tense'] - embedding VECTOR(768) NOT NULL, -- <— dimension must match your model -); - -## Overall System - -### Frontend -Hosted on Vercel, React Native frontend - -### Backend -- MVP V1: Lambda for serverless API calls -- MVP V2: Lambda and Flask & EC2 for scalability - -### Databases -NoSQL database for user info (DynamoDB) and Vector database for 5K global cache (Postgres + PGVector) - -### Cloud Tools -AWS Bedrock, DynamoDB, Lambda - -## MVP V1: Single-word lookup -- Global cache for 5K common words (Bedrock, Lambda) -- LLM generation for non-cached word templates (Bedrock, Lambda, DynamoDB) -- Templates that users can edit & save to personal collection (DynamoDB) - -## MVP V2: Article processing & AI Q&A assistant + Scaling -- Create multiple templated notes at once from a single media source -- Build LLM assistant - Bedrock, fine tuning -- Implement spaced repetition system (SRS) - fsrs ibrary -- Keep track of different media supporting vocab words -- User Authentication (via Google) - -## Add-Ons -- User-specific caching system -- Metrics & dashboards -- Linking templates together by similarity/relevance -- Proactive recommendation for user learning diff --git a/prompts.txt b/prompts.txt new file mode 100644 index 0000000..606a5f4 --- /dev/null +++ b/prompts.txt @@ -0,0 +1,37 @@ +**Prompt 1 — BaseCard Generation Prompt** +You are an expert lexicographer and language tutor. + +Generate a single baseCard JSON object following the BaseCard Specification v1. 
+ +Rules: +- Populate all required fields with high-quality, learner-friendly content. +- Definitions must be clear, concise, and non-circular. +- Examples must sound natural and reflect real usage. +- Prefer simple language unless the word is inherently technical. +- If the lemma has multiple meanings, include only the most common 1–2. +- Do not include DynamoDB metadata or storage fields. +- Output valid JSON only. No commentary. + +Input: +Language: {{language}} +Lemma: {{lemma}} +Target learner level: {{level}} +Optional context focus: {{context}} + +**Prompt 2 — BaseCard Q&A Prompt** +You are a language tutor AI. + +You are given a baseCard JSON object and a learner question. + +Rules: +- Use the baseCard as your primary source of truth. +- You may explain or rephrase, but do not invent new meanings. +- If the question goes beyond the card, say so clearly. +- Be concise, clear, and learner-focused. +- Provide examples only if they help understanding. + +BaseCard: +{{baseCardJson}} + +Learner question: +{{question}}