feat: brain service — self-contained second brain knowledge manager
Full backend service with: - FastAPI REST API with CRUD, search, reprocess endpoints - PostgreSQL + pgvector for items and semantic search - Redis + RQ for background job processing - Meilisearch for fast keyword/filter search - Browserless/Chrome for JS rendering and screenshots - OpenAI structured output for AI classification - Local file storage with S3-ready abstraction - Gateway auth via X-Gateway-User-Id header - Own docker-compose stack (6 containers) Classification: fixed folders (Home/Family/Work/Travel/Knowledge/Faith/Projects) and fixed tags (28 predefined). AI assigns exactly 1 folder, 2-3 tags, title, summary, and confidence score per item. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
0
services/brain/app/services/__init__.py
Normal file
0
services/brain/app/services/__init__.py
Normal file
125
services/brain/app/services/classify.py
Normal file
125
services/brain/app/services/classify.py
Normal file
@@ -0,0 +1,125 @@
|
||||
"""OpenAI classification — structured output for folder/tags/title/summary."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
import httpx
|
||||
|
||||
from app.config import OPENAI_API_KEY, OPENAI_MODEL, FOLDERS, TAGS
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# System prompt pinning the model to the service's fixed taxonomy.
# FOLDERS and TAGS come from app.config, so the prompt, the JSON schema
# below, and the post-hoc validation in classify_item() always agree.
SYSTEM_PROMPT = f"""You are a classification engine for a personal "second brain" knowledge management system.

Given an item (URL, note, document, or file), you must return structured JSON with:
- folder: exactly 1 from this list: {json.dumps(FOLDERS)}
- tags: exactly 2 or 3 from this list: {json.dumps(TAGS)}
- title: a concise, normalized title (max 80 chars)
- summary: a 1-2 sentence summary of the content
- confidence: a float 0.0-1.0 indicating how confident you are

Rules:
- NEVER invent folders or tags not in the lists above
- NEVER skip classification
- NEVER return freeform text outside the schema
- Always return valid JSON matching the schema exactly"""

# OpenAI "response_format" payload for structured output.
# "strict": True asks the API to enforce the schema server-side.
# NOTE(review): strict mode has historically rejected some JSON Schema
# keywords (e.g. minItems/maxItems) — verify against the current API;
# classify_item() re-validates the tag count client-side regardless.
RESPONSE_SCHEMA = {
    "type": "json_schema",
    "json_schema": {
        "name": "classification",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                # Closed sets: enum keeps the model inside the fixed taxonomy.
                "folder": {"type": "string", "enum": FOLDERS},
                "tags": {
                    "type": "array",
                    "items": {"type": "string", "enum": TAGS},
                    "minItems": 2,
                    "maxItems": 3,
                },
                "title": {"type": "string"},
                "summary": {"type": "string"},
                "confidence": {"type": "number"},
            },
            "required": ["folder", "tags", "title", "summary", "confidence"],
            "additionalProperties": False,
        },
    },
}
|
||||
|
||||
|
||||
def build_user_prompt(item_type: str, url: str | None, title: str | None, text: str | None) -> str:
|
||||
parts = [f"Item type: {item_type}"]
|
||||
if url:
|
||||
parts.append(f"URL: {url}")
|
||||
if title:
|
||||
parts.append(f"Original title: {title}")
|
||||
if text:
|
||||
# Truncate to ~4000 chars for context window efficiency
|
||||
truncated = text[:4000]
|
||||
parts.append(f"Content:\n{truncated}")
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _fallback_classification(title: str | None, summary: str) -> dict:
    """Default result used when the AI is unavailable or every attempt fails."""
    return {
        "folder": "Knowledge",
        "tags": ["reference", "read-later"],
        "title": title or "Untitled",
        "summary": summary,
        "confidence": 0.0,
    }


async def classify_item(
    item_type: str,
    url: str | None = None,
    title: str | None = None,
    text: str | None = None,
    retries: int = 2,
) -> dict:
    """Call OpenAI to classify an item. Returns dict with folder, tags, title, summary, confidence.

    Never raises: when no API key is configured or all retries + 1 attempts
    fail, a default "Knowledge" classification with confidence 0.0 is returned.
    """
    if not OPENAI_API_KEY:
        log.warning("No OPENAI_API_KEY set, returning defaults")
        return _fallback_classification(title, "No AI classification available")

    user_msg = build_user_prompt(item_type, url, title, text)

    last_error: Exception | None = None
    for attempt in range(retries + 1):
        try:
            async with httpx.AsyncClient(timeout=30) as client:
                resp = await client.post(
                    "https://api.openai.com/v1/chat/completions",
                    headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
                    json={
                        "model": OPENAI_MODEL,
                        "messages": [
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {"role": "user", "content": user_msg},
                        ],
                        "response_format": RESPONSE_SCHEMA,
                        # Low temperature: classification should be near-deterministic.
                        "temperature": 0.2,
                    },
                )
            resp.raise_for_status()
            data = resp.json()
            content = data["choices"][0]["message"]["content"]
            result = json.loads(content)

            # Defensive validation: the strict schema should already guarantee
            # this, but never trust model output blindly.
            if result["folder"] not in FOLDERS:
                result["folder"] = "Knowledge"
            tags = [t for t in result["tags"] if t in TAGS][:3]
            # BUG FIX: pad up to the 2-tag minimum without creating duplicates
            # (the old concatenation could yield e.g. ["reference", "reference"]).
            for fallback_tag in ("reference", "read-later"):
                if len(tags) >= 2:
                    break
                if fallback_tag not in tags:
                    tags.append(fallback_tag)
            result["tags"] = tags

            return result

        except Exception as e:
            last_error = e
            log.error(f"Classification attempt {attempt + 1} failed: {e}")

    # All attempts exhausted — report the last error in the summary.
    return _fallback_classification(title, f"Classification failed: {last_error}")
|
||||
36
services/brain/app/services/embed.py
Normal file
36
services/brain/app/services/embed.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""Embedding generation via OpenAI text-embedding API."""
|
||||
|
||||
import logging
|
||||
|
||||
import httpx
|
||||
|
||||
from app.config import OPENAI_API_KEY, OPENAI_EMBED_MODEL, OPENAI_EMBED_DIM
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def generate_embedding(text: str) -> list[float] | None:
    """Generate a vector embedding for the given text. Returns list of floats or None on failure."""
    if not OPENAI_API_KEY or not text.strip():
        return None

    # Truncate to ~8000 chars for embedding model token limit
    payload = {
        "model": OPENAI_EMBED_MODEL,
        "input": text[:8000],
        "dimensions": OPENAI_EMBED_DIM,
    }

    try:
        async with httpx.AsyncClient(timeout=20) as client:
            resp = await client.post(
                "https://api.openai.com/v1/embeddings",
                headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
                json=payload,
            )
        resp.raise_for_status()
        body = resp.json()
        return body["data"][0]["embedding"]
    except Exception as e:
        log.error(f"Embedding generation failed: {e}")
        return None
|
||||
164
services/brain/app/services/ingest.py
Normal file
164
services/brain/app/services/ingest.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""Content ingestion — fetch, extract, screenshot, archive."""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
from html.parser import HTMLParser
|
||||
from io import StringIO
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
from app.config import BROWSERLESS_URL
|
||||
from app.services.storage import storage
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class _HTMLTextExtractor(HTMLParser):
|
||||
"""Simple HTML to text converter."""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._result = StringIO()
|
||||
self._skip = False
|
||||
self._skip_tags = {"script", "style", "noscript", "svg"}
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag in self._skip_tags:
|
||||
self._skip = True
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag in self._skip_tags:
|
||||
self._skip = False
|
||||
if tag in ("p", "div", "br", "h1", "h2", "h3", "h4", "li", "tr"):
|
||||
self._result.write("\n")
|
||||
|
||||
def handle_data(self, data):
|
||||
if not self._skip:
|
||||
self._result.write(data)
|
||||
|
||||
def get_text(self) -> str:
|
||||
raw = self._result.getvalue()
|
||||
# Collapse whitespace
|
||||
lines = [line.strip() for line in raw.splitlines()]
|
||||
return "\n".join(line for line in lines if line)
|
||||
|
||||
|
||||
def html_to_text(html: str) -> str:
    """Render an HTML document as cleaned plain text."""
    converter = _HTMLTextExtractor()
    converter.feed(html)
    return converter.get_text()
|
||||
|
||||
|
||||
def extract_title_from_html(html: str) -> str | None:
|
||||
match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
|
||||
return match.group(1).strip() if match else None
|
||||
|
||||
|
||||
def extract_meta_description(html: str) -> str | None:
|
||||
match = re.search(
|
||||
r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
|
||||
html, re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
return match.group(1).strip() if match else None
|
||||
|
||||
|
||||
async def fetch_url_content(url: str) -> dict:
    """Fetch URL content. Returns dict with html, text, title, description, used_browserless.

    Strategy: plain HTTP GET first (cheap), then fall back to the
    browserless rendering service either when the HTTP fetch fails outright
    or when it succeeds but yields very little text (JS-heavy pages).
    Never raises; on total failure the content fields remain None.
    """
    result = {"html": None, "text": None, "title": None, "description": None, "used_browserless": False}

    # Try HTTP-first extraction
    try:
        async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
            resp = await client.get(url, headers={
                # Browser-like UA — presumably to avoid naive bot blocking; verify.
                "User-Agent": "Mozilla/5.0 (compatible; SecondBrain/1.0)"
            })
            resp.raise_for_status()
            html = resp.text
            result["html"] = html
            result["text"] = html_to_text(html)
            result["title"] = extract_title_from_html(html)
            result["description"] = extract_meta_description(html)

        # If extraction is weak (< 200 chars of text), try browserless
        if len(result["text"] or "") < 200:
            log.info(f"Weak extraction ({len(result['text'] or '')} chars), trying browserless")
            br = await fetch_with_browserless(url)
            # Only adopt the rendered version if it actually extracted more text.
            if br and len(br.get("text", "")) > len(result["text"] or ""):
                result.update(br)
                result["used_browserless"] = True

    except Exception as e:
        # HTTP path failed (network error, non-2xx, parse error) — last
        # resort is the headless-browser fetch.
        log.warning(f"HTTP fetch failed for {url}: {e}, trying browserless")
        try:
            br = await fetch_with_browserless(url)
            if br:
                result.update(br)
                result["used_browserless"] = True
        except Exception as e2:
            # Both strategies exhausted; caller receives the all-None result.
            log.error(f"Browserless also failed for {url}: {e2}")

    return result
|
||||
|
||||
|
||||
async def fetch_with_browserless(url: str) -> dict | None:
    """Use browserless/chrome to render JS-heavy pages."""
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(
                f"{BROWSERLESS_URL}/content",
                json={"url": url, "waitForTimeout": 3000},
            )
        if resp.status_code != 200:
            return None
        rendered = resp.text
        return {
            "html": rendered,
            "text": html_to_text(rendered),
            "title": extract_title_from_html(rendered),
            "description": extract_meta_description(rendered),
        }
    except Exception as e:
        log.error(f"Browserless fetch failed: {e}")
        return None
|
||||
|
||||
|
||||
async def take_screenshot(url: str, item_id: str) -> str | None:
    """Take a screenshot of a URL using browserless. Returns storage path or None."""
    payload = {
        "url": url,
        "options": {"type": "png", "fullPage": False},
        "waitForTimeout": 3000,
    }
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(f"{BROWSERLESS_URL}/screenshot", json=payload)
        if resp.status_code == 200:
            return storage.save(
                item_id=item_id,
                asset_type="screenshot",
                filename="screenshot.png",
                data=resp.content,
            )
    except Exception as e:
        log.error(f"Screenshot failed for {url}: {e}")
    return None
|
||||
|
||||
|
||||
async def archive_html(html: str, item_id: str) -> str | None:
    """Save the full HTML as an archived asset."""
    if not html:
        return None
    try:
        return storage.save(
            item_id=item_id,
            asset_type="archived_html",
            filename="page.html",
            data=html.encode("utf-8"),
        )
    except Exception as e:
        log.error(f"HTML archive failed: {e}")
        return None
|
||||
81
services/brain/app/services/storage.py
Normal file
81
services/brain/app/services/storage.py
Normal file
@@ -0,0 +1,81 @@
|
||||
"""File storage abstraction — local disk first, S3-ready interface."""
|
||||
|
||||
import os
|
||||
import shutil
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
|
||||
from app.config import STORAGE_BACKEND, STORAGE_LOCAL_PATH
|
||||
|
||||
|
||||
class StorageBackend(ABC):
    """Abstract file-storage interface (local disk now, S3-ready).

    Assets are addressed by relative keys of the form
    "<item_id>/<asset_type>/<filename>".
    """

    @abstractmethod
    def save(self, item_id: str, asset_type: str, filename: str, data: bytes) -> str:
        """Save file, return relative storage path."""
        ...

    @abstractmethod
    def read(self, path: str) -> bytes:
        """Return the raw bytes stored under the relative path."""
        ...

    @abstractmethod
    def delete(self, path: str) -> None:
        """Remove the asset at the relative path (no error if absent)."""
        ...

    @abstractmethod
    def exists(self, path: str) -> bool:
        """Return True if an asset exists at the relative path."""
        ...

    @abstractmethod
    def url(self, path: str) -> str:
        """Return a URL or local path for serving."""
        ...


class LocalStorage(StorageBackend):
    """Store assets as plain files under a base directory on local disk."""

    def __init__(self, base_path: str):
        self.base = Path(base_path)
        self.base.mkdir(parents=True, exist_ok=True)

    def _full_path(self, path: str) -> Path:
        """Resolve a relative storage key to an absolute on-disk path."""
        return self.base / path

    def save(self, item_id: str, asset_type: str, filename: str, data: bytes) -> str:
        # BUG FIX: the key previously hard-coded a literal "(unknown)" instead
        # of the caller's filename, so every asset of the same type collided on
        # one file and could never be addressed by name.
        # Keep only the basename so a crafted filename (e.g. "../../etc/x")
        # cannot escape the storage root.
        safe_name = Path(filename).name or "unnamed"
        rel = f"{item_id}/{asset_type}/{safe_name}"
        full = self._full_path(rel)
        full.parent.mkdir(parents=True, exist_ok=True)
        full.write_bytes(data)
        return rel

    def read(self, path: str) -> bytes:
        return self._full_path(path).read_bytes()

    def delete(self, path: str) -> None:
        full = self._full_path(path)
        if full.exists():
            full.unlink()
        # Clean empty parent dirs, stopping at the base directory.
        parent = full.parent
        while parent != self.base:
            try:
                parent.rmdir()  # raises OSError when non-empty — our stop signal
                parent = parent.parent
            except OSError:
                break

    def exists(self, path: str) -> bool:
        return self._full_path(path).exists()

    def url(self, path: str) -> str:
        # Relative URL under the service's /storage route.
        return f"/storage/{path}"
|
||||
|
||||
|
||||
# Future: S3Storage class implementing the same interface
|
||||
|
||||
def _create_storage() -> StorageBackend:
    """Instantiate the backend selected by the STORAGE_BACKEND setting."""
    if STORAGE_BACKEND != "local":
        raise ValueError(f"Unknown storage backend: {STORAGE_BACKEND}")
    return LocalStorage(STORAGE_LOCAL_PATH)


# Module-level singleton used throughout the service.
storage = _create_storage()
|
||||
Reference in New Issue
Block a user