Files
platform/services/brain/app/services/classify.py
Yusuf Suleman 2c3f0d263b feat: brain detail sheet — screenshot for links, editable notes, spelling fix
- Link detail: shows screenshot image (clickable to open URL), URL, summary, tags
- Note detail: click note text to edit, save/cancel buttons
- Notes: AI now fixes spelling/grammar instead of rewriting
- AI returns corrected_text field for notes, worker replaces raw_content
- Removed verbose meta grid (folder/confidence/status/saved)
- Folder shown as a pill badge in meta line

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-01 18:27:27 -05:00

130 lines
4.9 KiB
Python

"""OpenAI classification — structured output for folder/tags/title/summary."""
import json
import logging
import httpx
from app.config import OPENAI_API_KEY, OPENAI_MODEL, FOLDERS, TAGS
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# System prompt sent as the "system" message of every classification request.
# This is an f-string evaluated once at import time, so the allowed folder and
# tag lists from app.config are baked into the prompt as JSON arrays.
# The field list below mirrors the properties declared in RESPONSE_SCHEMA.
SYSTEM_PROMPT = f"""You are a classification engine for a personal "second brain" knowledge management system.
Given an item (URL, note, document, or file), you must return structured JSON with:
- folder: exactly 1 from this list: {json.dumps(FOLDERS)}
- tags: exactly 2 or 3 from this list: {json.dumps(TAGS)}
- title: a concise, normalized title (max 80 chars)
- summary: a 1-2 sentence summary of the content (for links/documents only)
- corrected_text: for NOTES ONLY — return the original note text with spelling/grammar fixed. Keep the original meaning, tone, and structure. Only fix typos and obvious errors. Return empty string for non-notes.
- confidence: a float 0.0-1.0 indicating how confident you are
Rules:
- NEVER invent folders or tags not in the lists above
- NEVER skip classification
- NEVER return freeform text outside the schema
- For notes: do NOT summarize. Keep the original text. Only fix spelling.
- For notes: the summary field should be a very short 5-10 word description, not a rewrite.
- Always return valid JSON matching the schema exactly"""
# Value passed as "response_format" in the chat-completions request body.
# It asks for strict JSON-schema output, with folder and tags constrained to
# the FOLDERS / TAGS enums imported from app.config.
RESPONSE_SCHEMA = {
"type": "json_schema",
"json_schema": {
"name": "classification",
"strict": True,
"schema": {
"type": "object",
"properties": {
"folder": {"type": "string", "enum": FOLDERS},
# Between two and three tags, each drawn from the TAGS enum.
"tags": {
"type": "array",
"items": {"type": "string", "enum": TAGS},
"minItems": 2,
"maxItems": 3,
},
"title": {"type": "string"},
"summary": {"type": "string"},
# The system prompt asks for an empty string here for non-note items.
"corrected_text": {"type": "string"},
"confidence": {"type": "number"},
},
# Every declared property is required and no extra keys are allowed.
"required": ["folder", "tags", "title", "summary", "corrected_text", "confidence"],
"additionalProperties": False,
},
},
}
def build_user_prompt(item_type: str, url: str | None, title: str | None, text: str | None) -> str:
parts = [f"Item type: {item_type}"]
if url:
parts.append(f"URL: {url}")
if title:
parts.append(f"Original title: {title}")
if text:
# Truncate to ~4000 chars for context window efficiency
truncated = text[:4000]
parts.append(f"Content:\n{truncated}")
return "\n\n".join(parts)
async def classify_item(
item_type: str,
url: str | None = None,
title: str | None = None,
text: str | None = None,
retries: int = 2,
) -> dict:
"""Call OpenAI to classify an item. Returns dict with folder, tags, title, summary, confidence."""
if not OPENAI_API_KEY:
log.warning("No OPENAI_API_KEY set, returning defaults")
return {
"folder": "Knowledge",
"tags": ["reference", "read-later"],
"title": title or "Untitled",
"summary": "No AI classification available",
"confidence": 0.0,
}
user_msg = build_user_prompt(item_type, url, title, text)
for attempt in range(retries + 1):
try:
async with httpx.AsyncClient(timeout=30) as client:
resp = await client.post(
"https://api.openai.com/v1/chat/completions",
headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
json={
"model": OPENAI_MODEL,
"messages": [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": user_msg},
],
"response_format": RESPONSE_SCHEMA,
"temperature": 0.2,
},
)
resp.raise_for_status()
data = resp.json()
content = data["choices"][0]["message"]["content"]
result = json.loads(content)
# Validate folder and tags are in allowed sets
if result["folder"] not in FOLDERS:
result["folder"] = "Knowledge"
result["tags"] = [t for t in result["tags"] if t in TAGS][:3]
if len(result["tags"]) < 2:
result["tags"] = (result["tags"] + ["reference", "read-later"])[:3]
return result
except Exception as e:
log.error(f"Classification attempt {attempt + 1} failed: {e}")
if attempt == retries:
return {
"folder": "Knowledge",
"tags": ["reference", "read-later"],
"title": title or "Untitled",
"summary": f"Classification failed: {e}",
"confidence": 0.0,
}