feat: brain service — self-contained second brain knowledge manager

Full backend service with:
- FastAPI REST API with CRUD, search, reprocess endpoints
- PostgreSQL + pgvector for items and semantic search
- Redis + RQ for background job processing
- Meilisearch for fast keyword/filter search
- Browserless/Chrome for JS rendering and screenshots
- OpenAI structured output for AI classification
- Local file storage with S3-ready abstraction
- Gateway auth via X-Gateway-User-Id header
- Own docker-compose stack (6 containers)

Classification: fixed folders (Home/Family/Work/Travel/Knowledge/Faith/Projects)
and fixed tags (28 predefined). AI assigns exactly 1 folder, 2-3 tags, title,
summary, and confidence score per item.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Yusuf Suleman
2026-04-01 11:48:29 -05:00
parent 51a8157fd4
commit 8275f3a71b
73 changed files with 24081 additions and 4209 deletions

View File

View File

@@ -0,0 +1,125 @@
"""OpenAI classification — structured output for folder/tags/title/summary."""
import json
import logging
import httpx
from app.config import OPENAI_API_KEY, OPENAI_MODEL, FOLDERS, TAGS
log = logging.getLogger(__name__)
SYSTEM_PROMPT = f"""You are a classification engine for a personal "second brain" knowledge management system.
Given an item (URL, note, document, or file), you must return structured JSON with:
- folder: exactly 1 from this list: {json.dumps(FOLDERS)}
- tags: exactly 2 or 3 from this list: {json.dumps(TAGS)}
- title: a concise, normalized title (max 80 chars)
- summary: a 1-2 sentence summary of the content
- confidence: a float 0.0-1.0 indicating how confident you are
Rules:
- NEVER invent folders or tags not in the lists above
- NEVER skip classification
- NEVER return freeform text outside the schema
- Always return valid JSON matching the schema exactly"""
RESPONSE_SCHEMA = {
"type": "json_schema",
"json_schema": {
"name": "classification",
"strict": True,
"schema": {
"type": "object",
"properties": {
"folder": {"type": "string", "enum": FOLDERS},
"tags": {
"type": "array",
"items": {"type": "string", "enum": TAGS},
"minItems": 2,
"maxItems": 3,
},
"title": {"type": "string"},
"summary": {"type": "string"},
"confidence": {"type": "number"},
},
"required": ["folder", "tags", "title", "summary", "confidence"],
"additionalProperties": False,
},
},
}
def build_user_prompt(item_type: str, url: str | None, title: str | None, text: str | None) -> str:
parts = [f"Item type: {item_type}"]
if url:
parts.append(f"URL: {url}")
if title:
parts.append(f"Original title: {title}")
if text:
# Truncate to ~4000 chars for context window efficiency
truncated = text[:4000]
parts.append(f"Content:\n{truncated}")
return "\n\n".join(parts)
def _fallback_classification(title: str | None, summary: str) -> dict:
    """Build the safe default classification used when AI output is unavailable."""
    return {
        "folder": "Knowledge",
        "tags": ["reference", "read-later"],
        "title": title or "Untitled",
        "summary": summary,
        "confidence": 0.0,
    }


def _sanitize_classification(result: dict) -> dict:
    """Clamp model output to the allowed folder/tag sets.

    Drops unknown tags, then tops the list back up to the required minimum of
    two WITHOUT introducing duplicates. (The previous fill appended
    ["reference", "read-later"] unconditionally, so a surviving "reference"
    tag produced e.g. ["reference", "reference", "read-later"].)
    """
    if result["folder"] not in FOLDERS:
        result["folder"] = "Knowledge"
    tags = [t for t in result["tags"] if t in TAGS][:3]
    for filler in ("reference", "read-later"):
        if len(tags) >= 2:
            break
        if filler not in tags:
            tags.append(filler)
    result["tags"] = tags
    return result


async def classify_item(
    item_type: str,
    url: str | None = None,
    title: str | None = None,
    text: str | None = None,
    retries: int = 2,
) -> dict:
    """Call OpenAI to classify an item. Returns dict with folder, tags, title, summary, confidence.

    Args:
        item_type: Kind of item (e.g. "url", "note", "document", "file").
        url: Source URL, if any.
        title: Original title, if any.
        text: Extracted text content, if any.
        retries: Extra attempts after the first failure.

    Never raises: returns a default "Knowledge" classification when no API key
    is configured or when every attempt fails.
    """
    if not OPENAI_API_KEY:
        log.warning("No OPENAI_API_KEY set, returning defaults")
        return _fallback_classification(title, "No AI classification available")
    user_msg = build_user_prompt(item_type, url, title, text)
    last_error: Exception | None = None
    for attempt in range(retries + 1):
        try:
            async with httpx.AsyncClient(timeout=30) as client:
                resp = await client.post(
                    "https://api.openai.com/v1/chat/completions",
                    headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
                    json={
                        "model": OPENAI_MODEL,
                        "messages": [
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {"role": "user", "content": user_msg},
                        ],
                        "response_format": RESPONSE_SCHEMA,
                        # Low temperature: classification should be near-deterministic.
                        "temperature": 0.2,
                    },
                )
                resp.raise_for_status()
                content = resp.json()["choices"][0]["message"]["content"]
                # Malformed JSON or missing keys raise here and count as a
                # failed attempt, triggering a retry.
                return _sanitize_classification(json.loads(content))
        except Exception as e:
            log.error(f"Classification attempt {attempt + 1} failed: {e}")
            last_error = e
    # All attempts exhausted.
    return _fallback_classification(title, f"Classification failed: {last_error}")

View File

@@ -0,0 +1,36 @@
"""Embedding generation via OpenAI text-embedding API."""
import logging
import httpx
from app.config import OPENAI_API_KEY, OPENAI_EMBED_MODEL, OPENAI_EMBED_DIM
log = logging.getLogger(__name__)
async def generate_embedding(text: str) -> list[float] | None:
    """Generate a vector embedding for the given text.

    Returns the embedding as a list of floats, or None when no API key is
    configured, the text is blank, or the API call fails for any reason.
    """
    if not (OPENAI_API_KEY and text.strip()):
        return None
    payload = {
        "model": OPENAI_EMBED_MODEL,
        # Truncate to ~8000 chars to stay within the embedding model's token limit.
        "input": text[:8000],
        "dimensions": OPENAI_EMBED_DIM,
    }
    try:
        async with httpx.AsyncClient(timeout=20) as client:
            resp = await client.post(
                "https://api.openai.com/v1/embeddings",
                headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
                json=payload,
            )
            resp.raise_for_status()
            return resp.json()["data"][0]["embedding"]
    except Exception as e:
        log.error(f"Embedding generation failed: {e}")
        return None

View File

@@ -0,0 +1,164 @@
"""Content ingestion — fetch, extract, screenshot, archive."""
import logging
import re
import uuid
from html.parser import HTMLParser
from io import StringIO
from urllib.parse import urlparse
import httpx
from app.config import BROWSERLESS_URL
from app.services.storage import storage
log = logging.getLogger(__name__)
class _HTMLTextExtractor(HTMLParser):
    """Simple HTML to text converter.

    Drops the contents of non-text tags (script/style/...) and emits a
    newline around block-level tags so text blocks land on separate lines.
    """
    # Tags whose text content is dropped entirely.
    _SKIP_TAGS = {"script", "style", "noscript", "svg"}
    # Tags that delimit blocks of text; a newline is written around them.
    _BREAK_TAGS = {"p", "div", "br", "h1", "h2", "h3", "h4", "li", "tr"}

    def __init__(self):
        super().__init__()
        self._result = StringIO()
        # Depth counter instead of a bool: with a bool, nested skip tags such
        # as <svg><script>...</script></svg> stopped skipping at the inner
        # close tag and leaked the rest of the outer tag's content.
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs):
        if tag in self._SKIP_TAGS:
            self._skip_depth += 1
        elif tag in self._BREAK_TAGS:
            # Break on block starts too: void tags like a bare <br> (no end
            # tag event) previously produced no newline at all. Extra blank
            # lines are collapsed by get_text().
            self._result.write("\n")

    def handle_endtag(self, tag):
        if tag in self._SKIP_TAGS:
            if self._skip_depth:
                self._skip_depth -= 1
        elif tag in self._BREAK_TAGS:
            self._result.write("\n")

    def handle_data(self, data):
        if not self._skip_depth:
            self._result.write(data)

    def get_text(self) -> str:
        """Return accumulated text with lines stripped and blanks removed."""
        raw = self._result.getvalue()
        lines = [line.strip() for line in raw.splitlines()]
        return "\n".join(line for line in lines if line)


def html_to_text(html: str) -> str:
    """Convert an HTML document to plain text, one text block per line."""
    extractor = _HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()
def extract_title_from_html(html: str) -> str | None:
match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
return match.group(1).strip() if match else None
def extract_meta_description(html: str) -> str | None:
match = re.search(
r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
html, re.IGNORECASE | re.DOTALL,
)
return match.group(1).strip() if match else None
async def fetch_url_content(url: str) -> dict:
    """Fetch URL content. Returns dict with html, text, title, description, used_browserless.

    Strategy: cheap plain-HTTP fetch first; if that fails or yields very
    little text, fall back to a browserless (rendered) fetch.
    """
    result = {
        "html": None,
        "text": None,
        "title": None,
        "description": None,
        "used_browserless": False,
    }
    headers = {"User-Agent": "Mozilla/5.0 (compatible; SecondBrain/1.0)"}
    try:
        async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
            resp = await client.get(url, headers=headers)
            resp.raise_for_status()
            page = resp.text
        result["html"] = page
        result["text"] = html_to_text(page)
        result["title"] = extract_title_from_html(page)
        result["description"] = extract_meta_description(page)
        # Weak extraction (< 200 chars of text) suggests a JS-rendered page.
        if len(result["text"] or "") < 200:
            log.info(f"Weak extraction ({len(result['text'] or '')} chars), trying browserless")
            rendered = await fetch_with_browserless(url)
            # Only adopt the rendered result when it actually extracted more text.
            if rendered and len(rendered.get("text", "")) > len(result["text"] or ""):
                result.update(rendered)
                result["used_browserless"] = True
    except Exception as e:
        log.warning(f"HTTP fetch failed for {url}: {e}, trying browserless")
        try:
            rendered = await fetch_with_browserless(url)
            if rendered:
                result.update(rendered)
                result["used_browserless"] = True
        except Exception as e2:
            log.error(f"Browserless also failed for {url}: {e2}")
    return result
async def fetch_with_browserless(url: str) -> dict | None:
    """Use browserless/chrome to render JS-heavy pages.

    Returns the same html/text/title/description dict shape as the plain
    HTTP path, or None on any non-200 response or error.
    """
    payload = {"url": url, "waitForTimeout": 3000}
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(f"{BROWSERLESS_URL}/content", json=payload)
        if resp.status_code != 200:
            return None
        rendered = resp.text
        return {
            "html": rendered,
            "text": html_to_text(rendered),
            "title": extract_title_from_html(rendered),
            "description": extract_meta_description(rendered),
        }
    except Exception as e:
        log.error(f"Browserless fetch failed: {e}")
        return None
async def take_screenshot(url: str, item_id: str) -> str | None:
    """Take a screenshot of a URL using browserless. Returns storage path or None."""
    payload = {
        "url": url,
        "options": {"type": "png", "fullPage": False},
        "waitForTimeout": 3000,
    }
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(f"{BROWSERLESS_URL}/screenshot", json=payload)
        if resp.status_code != 200:
            return None
        # Persist the PNG bytes under the item's asset tree.
        return storage.save(
            item_id=item_id,
            asset_type="screenshot",
            filename="screenshot.png",
            data=resp.content,
        )
    except Exception as e:
        log.error(f"Screenshot failed for {url}: {e}")
        return None
async def archive_html(html: str, item_id: str) -> str | None:
    """Save the full HTML as an archived asset.

    Returns the relative storage path, or None when html is empty or the
    write fails.
    """
    if not html:
        return None
    try:
        return storage.save(
            item_id=item_id,
            asset_type="archived_html",
            filename="page.html",
            data=html.encode("utf-8"),
        )
    except Exception as e:
        log.error(f"HTML archive failed: {e}")
        return None

View File

@@ -0,0 +1,81 @@
"""File storage abstraction — local disk first, S3-ready interface."""
import os
import shutil
from abc import ABC, abstractmethod
from pathlib import Path
from app.config import STORAGE_BACKEND, STORAGE_LOCAL_PATH
class StorageBackend(ABC):
    """Abstract storage interface: local disk today, S3-ready tomorrow."""

    @abstractmethod
    def save(self, item_id: str, asset_type: str, filename: str, data: bytes) -> str:
        """Save file, return relative storage path."""
        ...

    @abstractmethod
    def read(self, path: str) -> bytes:
        """Return the raw bytes stored at the relative path."""
        ...

    @abstractmethod
    def delete(self, path: str) -> None:
        """Remove the file at the relative path (no-op if missing)."""
        ...

    @abstractmethod
    def exists(self, path: str) -> bool:
        """Return True if the relative path exists in this backend."""
        ...

    @abstractmethod
    def url(self, path: str) -> str:
        """Return a URL or local path for serving."""
        ...


class LocalStorage(StorageBackend):
    """Local-disk backend; layout is <base>/<item_id>/<asset_type>/<filename>."""

    def __init__(self, base_path: str):
        self.base = Path(base_path)
        self.base.mkdir(parents=True, exist_ok=True)

    def _full_path(self, path: str) -> Path:
        return self.base / path

    def save(self, item_id: str, asset_type: str, filename: str, data: bytes) -> str:
        # Bug fix: the relative path previously hard-coded the literal string
        # "(unknown)" instead of interpolating the filename argument, so every
        # asset of the same type for an item overwrote the previous one and
        # the original filename was lost.
        rel = f"{item_id}/{asset_type}/{filename}"
        full = self._full_path(rel)
        full.parent.mkdir(parents=True, exist_ok=True)
        full.write_bytes(data)
        return rel

    def read(self, path: str) -> bytes:
        return self._full_path(path).read_bytes()

    def delete(self, path: str) -> None:
        full = self._full_path(path)
        if full.exists():
            full.unlink()
        # Clean now-empty parent dirs, stopping at (and never removing) base.
        parent = full.parent
        while parent != self.base:
            try:
                parent.rmdir()
                parent = parent.parent
            except OSError:
                # Directory not empty (or other FS error): stop cleaning.
                break

    def exists(self, path: str) -> bool:
        return self._full_path(path).exists()

    def url(self, path: str) -> str:
        # Served by the API's /storage static mount.
        return f"/storage/{path}"
# Future: S3Storage class implementing the same interface
def _create_storage() -> StorageBackend:
    """Instantiate the backend selected by STORAGE_BACKEND (only "local" exists)."""
    if STORAGE_BACKEND != "local":
        raise ValueError(f"Unknown storage backend: {STORAGE_BACKEND}")
    return LocalStorage(STORAGE_LOCAL_PATH)


# Module-level singleton used throughout the app.
storage = _create_storage()