feat: brain service — self-contained second brain knowledge manager

Full backend service with:
- FastAPI REST API with CRUD, search, reprocess endpoints
- PostgreSQL + pgvector for items and semantic search
- Redis + RQ for background job processing
- Meilisearch for fast keyword/filter search
- Browserless/Chrome for JS rendering and screenshots
- OpenAI structured output for AI classification
- Local file storage with S3-ready abstraction
- Gateway auth via X-Gateway-User-Id header
- Own docker-compose stack (6 containers)

Classification: fixed folders (Home/Family/Work/Travel/Knowledge/Faith/Projects)
and fixed tags (28 predefined). AI assigns exactly 1 folder, 2-3 tags, title,
summary, and confidence score per item.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Yusuf Suleman
2026-04-01 11:48:29 -05:00
parent 51a8157fd4
commit 8275f3a71b
73 changed files with 24081 additions and 4209 deletions

View File

View File

@@ -0,0 +1,125 @@
"""OpenAI classification — structured output for folder/tags/title/summary."""
import json
import logging
import httpx
from app.config import OPENAI_API_KEY, OPENAI_MODEL, FOLDERS, TAGS
log = logging.getLogger(__name__)
SYSTEM_PROMPT = f"""You are a classification engine for a personal "second brain" knowledge management system.
Given an item (URL, note, document, or file), you must return structured JSON with:
- folder: exactly 1 from this list: {json.dumps(FOLDERS)}
- tags: exactly 2 or 3 from this list: {json.dumps(TAGS)}
- title: a concise, normalized title (max 80 chars)
- summary: a 1-2 sentence summary of the content
- confidence: a float 0.0-1.0 indicating how confident you are
Rules:
- NEVER invent folders or tags not in the lists above
- NEVER skip classification
- NEVER return freeform text outside the schema
- Always return valid JSON matching the schema exactly"""
RESPONSE_SCHEMA = {
"type": "json_schema",
"json_schema": {
"name": "classification",
"strict": True,
"schema": {
"type": "object",
"properties": {
"folder": {"type": "string", "enum": FOLDERS},
"tags": {
"type": "array",
"items": {"type": "string", "enum": TAGS},
"minItems": 2,
"maxItems": 3,
},
"title": {"type": "string"},
"summary": {"type": "string"},
"confidence": {"type": "number"},
},
"required": ["folder", "tags", "title", "summary", "confidence"],
"additionalProperties": False,
},
},
}
def build_user_prompt(item_type: str, url: str | None, title: str | None, text: str | None) -> str:
parts = [f"Item type: {item_type}"]
if url:
parts.append(f"URL: {url}")
if title:
parts.append(f"Original title: {title}")
if text:
# Truncate to ~4000 chars for context window efficiency
truncated = text[:4000]
parts.append(f"Content:\n{truncated}")
return "\n\n".join(parts)
def _fallback_classification(title: str | None, summary: str) -> dict:
    """Build the safe default classification used when AI output is unavailable."""
    return {
        "folder": "Knowledge",
        "tags": ["reference", "read-later"],
        "title": title or "Untitled",
        "summary": summary,
        "confidence": 0.0,
    }


def _sanitize_classification(result: dict) -> dict:
    """Clamp model output to the allowed folder/tag sets.

    Drops unknown tags, then tops the list back up to the required minimum of
    two WITHOUT introducing duplicates. (The previous fill appended
    ["reference", "read-later"] unconditionally, so a surviving "reference"
    tag produced e.g. ["reference", "reference", "read-later"].)
    """
    if result["folder"] not in FOLDERS:
        result["folder"] = "Knowledge"
    tags = [t for t in result["tags"] if t in TAGS][:3]
    for filler in ("reference", "read-later"):
        if len(tags) >= 2:
            break
        if filler not in tags:
            tags.append(filler)
    result["tags"] = tags
    return result


async def classify_item(
    item_type: str,
    url: str | None = None,
    title: str | None = None,
    text: str | None = None,
    retries: int = 2,
) -> dict:
    """Call OpenAI to classify an item. Returns dict with folder, tags, title, summary, confidence.

    Args:
        item_type: Kind of item (e.g. "url", "note", "document", "file").
        url: Source URL, if any.
        title: Original title, if any.
        text: Extracted text content, if any.
        retries: Extra attempts after the first failure.

    Never raises: returns a default "Knowledge" classification when no API key
    is configured or when every attempt fails.
    """
    if not OPENAI_API_KEY:
        log.warning("No OPENAI_API_KEY set, returning defaults")
        return _fallback_classification(title, "No AI classification available")
    user_msg = build_user_prompt(item_type, url, title, text)
    last_error: Exception | None = None
    for attempt in range(retries + 1):
        try:
            async with httpx.AsyncClient(timeout=30) as client:
                resp = await client.post(
                    "https://api.openai.com/v1/chat/completions",
                    headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
                    json={
                        "model": OPENAI_MODEL,
                        "messages": [
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {"role": "user", "content": user_msg},
                        ],
                        "response_format": RESPONSE_SCHEMA,
                        # Low temperature: classification should be near-deterministic.
                        "temperature": 0.2,
                    },
                )
                resp.raise_for_status()
                content = resp.json()["choices"][0]["message"]["content"]
                # Malformed JSON or missing keys raise here and count as a
                # failed attempt, triggering a retry.
                return _sanitize_classification(json.loads(content))
        except Exception as e:
            log.error(f"Classification attempt {attempt + 1} failed: {e}")
            last_error = e
    # All attempts exhausted.
    return _fallback_classification(title, f"Classification failed: {last_error}")

View File

@@ -0,0 +1,36 @@
"""Embedding generation via OpenAI text-embedding API."""
import logging
import httpx
from app.config import OPENAI_API_KEY, OPENAI_EMBED_MODEL, OPENAI_EMBED_DIM
log = logging.getLogger(__name__)
async def generate_embedding(text: str) -> list[float] | None:
    """Generate a vector embedding for the given text.

    Returns the embedding as a list of floats, or None when no API key is
    configured, the text is blank, or the API call fails for any reason.
    """
    if not (OPENAI_API_KEY and text.strip()):
        return None
    payload = {
        "model": OPENAI_EMBED_MODEL,
        # Truncate to ~8000 chars to stay within the embedding model's token limit.
        "input": text[:8000],
        "dimensions": OPENAI_EMBED_DIM,
    }
    try:
        async with httpx.AsyncClient(timeout=20) as client:
            resp = await client.post(
                "https://api.openai.com/v1/embeddings",
                headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
                json=payload,
            )
            resp.raise_for_status()
            return resp.json()["data"][0]["embedding"]
    except Exception as e:
        log.error(f"Embedding generation failed: {e}")
        return None

View File

@@ -0,0 +1,164 @@
"""Content ingestion — fetch, extract, screenshot, archive."""
import logging
import re
import uuid
from html.parser import HTMLParser
from io import StringIO
from urllib.parse import urlparse
import httpx
from app.config import BROWSERLESS_URL
from app.services.storage import storage
log = logging.getLogger(__name__)
class _HTMLTextExtractor(HTMLParser):
    """Simple HTML to text converter.

    Drops the contents of non-text tags (script/style/...) and emits a
    newline around block-level tags so text blocks land on separate lines.
    """
    # Tags whose text content is dropped entirely.
    _SKIP_TAGS = {"script", "style", "noscript", "svg"}
    # Tags that delimit blocks of text; a newline is written around them.
    _BREAK_TAGS = {"p", "div", "br", "h1", "h2", "h3", "h4", "li", "tr"}

    def __init__(self):
        super().__init__()
        self._result = StringIO()
        # Depth counter instead of a bool: with a bool, nested skip tags such
        # as <svg><script>...</script></svg> stopped skipping at the inner
        # close tag and leaked the rest of the outer tag's content.
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs):
        if tag in self._SKIP_TAGS:
            self._skip_depth += 1
        elif tag in self._BREAK_TAGS:
            # Break on block starts too: void tags like a bare <br> (no end
            # tag event) previously produced no newline at all. Extra blank
            # lines are collapsed by get_text().
            self._result.write("\n")

    def handle_endtag(self, tag):
        if tag in self._SKIP_TAGS:
            if self._skip_depth:
                self._skip_depth -= 1
        elif tag in self._BREAK_TAGS:
            self._result.write("\n")

    def handle_data(self, data):
        if not self._skip_depth:
            self._result.write(data)

    def get_text(self) -> str:
        """Return accumulated text with lines stripped and blanks removed."""
        raw = self._result.getvalue()
        lines = [line.strip() for line in raw.splitlines()]
        return "\n".join(line for line in lines if line)


def html_to_text(html: str) -> str:
    """Convert an HTML document to plain text, one text block per line."""
    extractor = _HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()
def extract_title_from_html(html: str) -> str | None:
match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
return match.group(1).strip() if match else None
def extract_meta_description(html: str) -> str | None:
match = re.search(
r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
html, re.IGNORECASE | re.DOTALL,
)
return match.group(1).strip() if match else None
async def fetch_url_content(url: str) -> dict:
    """Fetch URL content. Returns dict with html, text, title, description, used_browserless.

    Strategy: cheap plain-HTTP fetch first; if that fails or yields very
    little text, fall back to a browserless (rendered) fetch.
    """
    result = {
        "html": None,
        "text": None,
        "title": None,
        "description": None,
        "used_browserless": False,
    }
    headers = {"User-Agent": "Mozilla/5.0 (compatible; SecondBrain/1.0)"}
    try:
        async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
            resp = await client.get(url, headers=headers)
            resp.raise_for_status()
            page = resp.text
        result["html"] = page
        result["text"] = html_to_text(page)
        result["title"] = extract_title_from_html(page)
        result["description"] = extract_meta_description(page)
        # Weak extraction (< 200 chars of text) suggests a JS-rendered page.
        if len(result["text"] or "") < 200:
            log.info(f"Weak extraction ({len(result['text'] or '')} chars), trying browserless")
            rendered = await fetch_with_browserless(url)
            # Only adopt the rendered result when it actually extracted more text.
            if rendered and len(rendered.get("text", "")) > len(result["text"] or ""):
                result.update(rendered)
                result["used_browserless"] = True
    except Exception as e:
        log.warning(f"HTTP fetch failed for {url}: {e}, trying browserless")
        try:
            rendered = await fetch_with_browserless(url)
            if rendered:
                result.update(rendered)
                result["used_browserless"] = True
        except Exception as e2:
            log.error(f"Browserless also failed for {url}: {e2}")
    return result
async def fetch_with_browserless(url: str) -> dict | None:
    """Use browserless/chrome to render JS-heavy pages.

    Returns the same html/text/title/description dict shape as the plain
    HTTP path, or None on any non-200 response or error.
    """
    payload = {"url": url, "waitForTimeout": 3000}
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(f"{BROWSERLESS_URL}/content", json=payload)
        if resp.status_code != 200:
            return None
        rendered = resp.text
        return {
            "html": rendered,
            "text": html_to_text(rendered),
            "title": extract_title_from_html(rendered),
            "description": extract_meta_description(rendered),
        }
    except Exception as e:
        log.error(f"Browserless fetch failed: {e}")
        return None
async def take_screenshot(url: str, item_id: str) -> str | None:
    """Take a screenshot of a URL using browserless. Returns storage path or None."""
    payload = {
        "url": url,
        "options": {"type": "png", "fullPage": False},
        "waitForTimeout": 3000,
    }
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(f"{BROWSERLESS_URL}/screenshot", json=payload)
        if resp.status_code != 200:
            return None
        # Persist the PNG bytes under the item's asset tree.
        return storage.save(
            item_id=item_id,
            asset_type="screenshot",
            filename="screenshot.png",
            data=resp.content,
        )
    except Exception as e:
        log.error(f"Screenshot failed for {url}: {e}")
        return None
async def archive_html(html: str, item_id: str) -> str | None:
    """Save the full HTML as an archived asset.

    Returns the relative storage path, or None when html is empty or the
    write fails.
    """
    if not html:
        return None
    try:
        return storage.save(
            item_id=item_id,
            asset_type="archived_html",
            filename="page.html",
            data=html.encode("utf-8"),
        )
    except Exception as e:
        log.error(f"HTML archive failed: {e}")
        return None

View File

@@ -0,0 +1,81 @@
"""File storage abstraction — local disk first, S3-ready interface."""
import os
import shutil
from abc import ABC, abstractmethod
from pathlib import Path
from app.config import STORAGE_BACKEND, STORAGE_LOCAL_PATH
class StorageBackend(ABC):
    """Abstract storage interface: local disk today, S3-ready tomorrow."""

    @abstractmethod
    def save(self, item_id: str, asset_type: str, filename: str, data: bytes) -> str:
        """Save file, return relative storage path."""
        ...

    @abstractmethod
    def read(self, path: str) -> bytes:
        """Return the raw bytes stored at the relative path."""
        ...

    @abstractmethod
    def delete(self, path: str) -> None:
        """Remove the file at the relative path (no-op if missing)."""
        ...

    @abstractmethod
    def exists(self, path: str) -> bool:
        """Return True if the relative path exists in this backend."""
        ...

    @abstractmethod
    def url(self, path: str) -> str:
        """Return a URL or local path for serving."""
        ...


class LocalStorage(StorageBackend):
    """Local-disk backend; layout is <base>/<item_id>/<asset_type>/<filename>."""

    def __init__(self, base_path: str):
        self.base = Path(base_path)
        self.base.mkdir(parents=True, exist_ok=True)

    def _full_path(self, path: str) -> Path:
        return self.base / path

    def save(self, item_id: str, asset_type: str, filename: str, data: bytes) -> str:
        # Bug fix: the relative path previously hard-coded the literal string
        # "(unknown)" instead of interpolating the filename argument, so every
        # asset of the same type for an item overwrote the previous one and
        # the original filename was lost.
        rel = f"{item_id}/{asset_type}/{filename}"
        full = self._full_path(rel)
        full.parent.mkdir(parents=True, exist_ok=True)
        full.write_bytes(data)
        return rel

    def read(self, path: str) -> bytes:
        return self._full_path(path).read_bytes()

    def delete(self, path: str) -> None:
        full = self._full_path(path)
        if full.exists():
            full.unlink()
        # Clean now-empty parent dirs, stopping at (and never removing) base.
        parent = full.parent
        while parent != self.base:
            try:
                parent.rmdir()
                parent = parent.parent
            except OSError:
                # Directory not empty (or other FS error): stop cleaning.
                break

    def exists(self, path: str) -> bool:
        return self._full_path(path).exists()

    def url(self, path: str) -> str:
        # Served by the API's /storage static mount.
        return f"/storage/{path}"
# Future: S3Storage class implementing the same interface
def _create_storage() -> StorageBackend:
    """Instantiate the backend selected by STORAGE_BACKEND (only "local" exists)."""
    if STORAGE_BACKEND != "local":
        raise ValueError(f"Unknown storage backend: {STORAGE_BACKEND}")
    return LocalStorage(STORAGE_LOCAL_PATH)


# Module-level singleton used throughout the app.
storage = _create_storage()