# platform/services/brain/app/services/classify.py

"""OpenAI classification — structured output for folder/tags/title/summary."""

import json
import logging

import httpx

from app.config import OPENAI_API_KEY, OPENAI_MODEL, FOLDERS, TAGS

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# System message sent with every classification request (see classify_item).
# FOLDERS and TAGS are JSON-encoded into the text once, when this module is
# imported. The fields listed here match the keys required by RESPONSE_SCHEMA.
SYSTEM_PROMPT = f"""You are a classification engine for a personal "second brain" knowledge management system.

Given an item (URL, note, document, or file), you must return structured JSON with:
- folder: exactly 1 from this list: {json.dumps(FOLDERS)}
- tags: exactly 2 or 3 from this list: {json.dumps(TAGS)}
- title: a concise, normalized title (max 80 chars)
- summary: a 1-2 sentence summary of the content (for links/documents only)
- corrected_text: for NOTES ONLY — return the original note text with spelling/grammar fixed. Keep the original meaning, tone, and structure. Only fix typos and obvious errors. Return empty string for non-notes.
- confidence: a float 0.0-1.0 indicating how confident you are

Rules:
- NEVER invent folders or tags not in the lists above
- NEVER skip classification
- NEVER return freeform text outside the schema
- For notes: do NOT summarize. Keep the original text. Only fix spelling.
- For notes: the summary field should be a very short 5-10 word description, not a rewrite.
- Always return valid JSON matching the schema exactly"""

# Per-field JSON Schema for the classification object. Folder and tag values
# are restricted to the configured FOLDERS / TAGS lists via ``enum``.
_CLASSIFICATION_PROPERTIES = {
    "folder": {"type": "string", "enum": FOLDERS},
    "tags": {
        "type": "array",
        "items": {"type": "string", "enum": TAGS},
        "minItems": 2,
        "maxItems": 3,
    },
    "title": {"type": "string"},
    "summary": {"type": "string"},
    "corrected_text": {"type": "string"},
    "confidence": {"type": "number"},
}

# Value sent as ``response_format`` in the chat-completions request. Every
# declared property is listed as required and no extra keys are allowed.
RESPONSE_SCHEMA = {
    "type": "json_schema",
    "json_schema": {
        "name": "classification",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": _CLASSIFICATION_PROPERTIES,
            # Dicts iterate in insertion order, so this is the property names
            # in the order they are declared above.
            "required": list(_CLASSIFICATION_PROPERTIES),
            "additionalProperties": False,
        },
    },
}


def build_user_prompt(item_type: str, url: str | None, title: str | None, text: str | None) -> str:
    parts = [f"Item type: {item_type}"]
    if url:
        parts.append(f"URL: {url}")
    if title:
        parts.append(f"Original title: {title}")
    if text:
        # Truncate to ~4000 chars for context window efficiency
        truncated = text[:4000]
        parts.append(f"Content:\n{truncated}")
    return "\n\n".join(parts)


# Values used whenever the model cannot be consulted or returns something
# outside the allowed lists.
_FALLBACK_FOLDER = "Knowledge"
_FALLBACK_TAGS = ("reference", "read-later")


def _fallback_classification(title: str | None, summary: str) -> dict:
    """Build the zero-confidence default result with the given summary text."""
    return {
        "folder": _FALLBACK_FOLDER,
        "tags": list(_FALLBACK_TAGS),
        "title": title or "Untitled",
        "summary": summary,
        # RESPONSE_SCHEMA requires this key in model responses, so include it
        # here too; callers then see the same keys on every path.
        "corrected_text": "",
        "confidence": 0.0,
    }


def _normalize_classification(result: dict) -> dict:
    """Restrict a parsed model response to the allowed folders and tags.

    Raises KeyError if ``folder`` or ``tags`` is missing; the caller treats
    that as a failed attempt.
    """
    if result["folder"] not in FOLDERS:
        result["folder"] = _FALLBACK_FOLDER

    # Keep only known tags, drop repeats while preserving the model's order.
    tags = list(dict.fromkeys(t for t in result["tags"] if t in TAGS))[:3]
    if len(tags) < 2:
        # Pad with the fallback tags, skipping any already present so the
        # result never contains the same tag twice.
        tags += [t for t in _FALLBACK_TAGS if t not in tags]
    result["tags"] = tags[:3]
    return result


async def classify_item(
    item_type: str,
    url: str | None = None,
    title: str | None = None,
    text: str | None = None,
    retries: int = 2,
) -> dict:
    """Call OpenAI to classify an item.

    Args:
        item_type: Kind of item being classified; passed through to the prompt.
        url: Source URL, if any.
        title: Title supplied with the item; also used as the fallback title.
        text: Body text, if any (truncated by ``build_user_prompt``).
        retries: Number of additional attempts after the first one fails.
            Negative values are treated as 0, so at least one attempt is made.

    Returns:
        A dict with ``folder``, ``tags``, ``title``, ``summary``,
        ``corrected_text`` and ``confidence``. When no API key is configured,
        or every attempt fails, a default result with ``confidence`` 0.0 is
        returned instead of raising.
    """
    if not OPENAI_API_KEY:
        log.warning("No OPENAI_API_KEY set, returning defaults")
        return _fallback_classification(title, "No AI classification available")

    # The request is identical on every attempt, so build it once.
    headers = {"Authorization": f"Bearer {OPENAI_API_KEY}"}
    payload = {
        "model": OPENAI_MODEL,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": build_user_prompt(item_type, url, title, text)},
        ],
        "response_format": RESPONSE_SCHEMA,
        "temperature": 0.2,
    }

    total_attempts = max(retries, 0) + 1
    last_error: Exception | None = None

    for attempt in range(1, total_attempts + 1):
        try:
            async with httpx.AsyncClient(timeout=30) as client:
                resp = await client.post(
                    "https://api.openai.com/v1/chat/completions",
                    headers=headers,
                    json=payload,
                )
                resp.raise_for_status()
                content = resp.json()["choices"][0]["message"]["content"]
            return _normalize_classification(json.loads(content))
        except Exception as e:
            # Deliberately broad: classification is best-effort, so transport
            # errors, HTTP errors and malformed responses all count as a
            # failed attempt rather than propagating to the caller.
            last_error = e
            log.error("Classification attempt %d failed: %s", attempt, e)

    return _fallback_classification(title, f"Classification failed: {last_error}")