feat: brain service — self-contained second brain knowledge manager
Full backend service with: - FastAPI REST API with CRUD, search, reprocess endpoints - PostgreSQL + pgvector for items and semantic search - Redis + RQ for background job processing - Meilisearch for fast keyword/filter search - Browserless/Chrome for JS rendering and screenshots - OpenAI structured output for AI classification - Local file storage with S3-ready abstraction - Gateway auth via X-Gateway-User-Id header - Own docker-compose stack (6 containers) Classification: fixed folders (Home/Family/Work/Travel/Knowledge/Faith/Projects) and fixed tags (28 predefined). AI assigns exactly 1 folder, 2-3 tags, title, summary, and confidence score per item. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
0
services/brain/app/__init__.py
Normal file
0
services/brain/app/__init__.py
Normal file
0
services/brain/app/api/__init__.py
Normal file
0
services/brain/app/api/__init__.py
Normal file
21
services/brain/app/api/deps.py
Normal file
21
services/brain/app/api/deps.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""API dependencies — auth, database session."""
|
||||
|
||||
from fastapi import Depends, Header, HTTPException
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.database import get_db
|
||||
|
||||
|
||||
async def get_user_id(
    x_gateway_user_id: str = Header(None, alias="X-Gateway-User-Id"),
) -> str:
    """Return the caller's user ID taken from the gateway-injected header.

    The gateway is the only trusted identity source for this internal
    service; a missing header means the request did not pass through it.
    """
    if x_gateway_user_id:
        return x_gateway_user_id
    raise HTTPException(status_code=401, detail="Not authenticated")
|
||||
|
||||
|
||||
async def get_db_session() -> AsyncSession:
    """Yield one async database session from the shared factory."""
    session_gen = get_db()
    async for db in session_gen:
        yield db
|
||||
319
services/brain/app/api/routes.py
Normal file
319
services/brain/app/api/routes.py
Normal file
@@ -0,0 +1,319 @@
|
||||
"""Brain API endpoints."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, Query
|
||||
from sqlalchemy import select, func, desc
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.api.deps import get_user_id, get_db_session
|
||||
from app.config import FOLDERS, TAGS
|
||||
from app.models.item import Item, ItemAsset
|
||||
from app.models.schema import (
|
||||
ItemCreate, ItemUpdate, ItemOut, ItemList, SearchQuery, SemanticSearchQuery,
|
||||
HybridSearchQuery, SearchResult, ConfigOut,
|
||||
)
|
||||
from app.services.storage import storage
|
||||
from app.worker.tasks import enqueue_process_item
|
||||
|
||||
router = APIRouter(prefix="/api", tags=["brain"])
|
||||
|
||||
|
||||
# ── Health ──
|
||||
|
||||
@router.get("/health")
async def health():
    """Liveness probe — returns a static payload, touches no dependencies."""
    payload = {"status": "ok", "service": "brain"}
    return payload
|
||||
|
||||
|
||||
# ── Config ──
|
||||
|
||||
@router.get("/config", response_model=ConfigOut)
async def get_config():
    """Expose the fixed folder/tag taxonomy so clients never hard-code it."""
    cfg = ConfigOut(folders=FOLDERS, tags=TAGS)
    return cfg
|
||||
|
||||
|
||||
# ── Create item ──
|
||||
|
||||
@router.post("/items", response_model=ItemOut, status_code=201)
async def create_item(
    body: ItemCreate,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Create an item from JSON input and queue it for background processing.

    The row is persisted immediately with processing_status="pending";
    extraction/classification happens asynchronously in the worker.
    Returns the freshly created item (assets list will be empty).
    """
    item = Item(
        id=str(uuid.uuid4()),
        user_id=user_id,
        type=body.type,
        url=body.url,
        raw_content=body.raw_content,
        title=body.title,
        folder=body.folder,
        tags=body.tags or [],  # normalize None to [] for the ARRAY column
        processing_status="pending",
    )
    db.add(item)
    await db.commit()
    # Load the (empty) assets relationship so ItemOut can serialize it.
    await db.refresh(item, ["assets"])

    # Enqueue background processing only after commit so the worker
    # is guaranteed to see the row.
    enqueue_process_item(item.id)

    return item
|
||||
|
||||
|
||||
# ── Upload file ──
|
||||
|
||||
@router.post("/items/upload", response_model=ItemOut, status_code=201)
async def upload_file(
    file: UploadFile = File(...),
    title: Optional[str] = Form(None),
    folder: Optional[str] = Form(None),
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Create an item from a multipart file upload.

    Stores the raw bytes first, then persists the Item plus an
    "original_upload" ItemAsset in one transaction, and finally enqueues
    background processing.

    NOTE(review): file.read() loads the whole upload into memory and the
    stored blob is not cleaned up if the DB commit later fails — acceptable
    for a personal service, but worth confirming for large uploads.
    """
    item_id = str(uuid.uuid4())
    content_type = file.content_type or "application/octet-stream"

    # Determine type from content_type
    if content_type.startswith("image/"):
        item_type = "image"
    elif content_type == "application/pdf":
        item_type = "pdf"
    else:
        item_type = "file"  # generic fallback for everything else

    # Store the uploaded file before creating DB rows, so the asset's
    # storage_path is known when the ItemAsset row is written.
    data = await file.read()
    path = storage.save(
        item_id=item_id,
        asset_type="original_upload",
        filename=file.filename or "upload",
        data=data,
    )

    item = Item(
        id=item_id,
        user_id=user_id,
        type=item_type,
        title=title or file.filename,  # fall back to the client filename
        folder=folder,
        processing_status="pending",
    )
    db.add(item)

    asset = ItemAsset(
        id=str(uuid.uuid4()),
        item_id=item_id,
        asset_type="original_upload",
        filename=file.filename or "upload",
        content_type=content_type,
        size_bytes=len(data),
        storage_path=path,
    )
    db.add(asset)

    # Item and asset are committed together in a single transaction.
    await db.commit()
    await db.refresh(item, ["assets"])

    # Enqueue only after commit so the worker can find the row.
    enqueue_process_item(item.id)
    return item
|
||||
|
||||
|
||||
# ── Get item ──
|
||||
|
||||
@router.get("/items/{item_id}", response_model=ItemOut)
async def get_item(
    item_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Fetch one item (with assets) owned by the caller, or 404."""
    stmt = (
        select(Item)
        .options(selectinload(Item.assets))
        .where(Item.id == item_id, Item.user_id == user_id)
    )
    found = (await db.execute(stmt)).scalar_one_or_none()
    if found is None:
        raise HTTPException(status_code=404, detail="Item not found")
    return found
|
||||
|
||||
|
||||
# ── List items ──
|
||||
|
||||
@router.get("/items", response_model=ItemList)
async def list_items(
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
    folder: Optional[str] = Query(None),
    tag: Optional[str] = Query(None),
    type: Optional[str] = Query(None),
    status: Optional[str] = Query(None),
    limit: int = Query(20, le=100),
    offset: int = Query(0),
):
    """List the caller's items, newest first, with optional filters.

    Filters (folder, tag, type, processing status) are ANDed together.
    Returns the page of items plus the total count matching the filters.
    """
    q = select(Item).options(selectinload(Item.assets)).where(Item.user_id == user_id)

    if folder:
        q = q.where(Item.folder == folder)
    if tag:
        # ARRAY containment: item must carry this tag.
        q = q.where(Item.tags.contains([tag]))
    if type:
        q = q.where(Item.type == type)
    if status:
        q = q.where(Item.processing_status == status)

    # Count the filtered set (before pagination is applied).
    count_q = select(func.count()).select_from(q.subquery())
    total = (await db.execute(count_q)).scalar() or 0

    # Fetch the requested page, newest first.
    q = q.order_by(desc(Item.created_at)).offset(offset).limit(limit)
    result = await db.execute(q)
    items = result.scalars().all()

    return ItemList(items=items, total=total)
|
||||
|
||||
|
||||
# ── Update item ──
|
||||
|
||||
@router.patch("/items/{item_id}", response_model=ItemOut)
async def update_item(
    item_id: str,
    body: ItemUpdate,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Partially update an item's user-editable fields; 404 if not owned."""
    stmt = (
        select(Item)
        .options(selectinload(Item.assets))
        .where(Item.id == item_id, Item.user_id == user_id)
    )
    item = (await db.execute(stmt)).scalar_one_or_none()
    if item is None:
        raise HTTPException(status_code=404, detail="Item not found")

    # Only fields explicitly provided (non-None) are applied.
    for field in ("title", "folder", "tags", "raw_content"):
        value = getattr(body, field)
        if value is not None:
            setattr(item, field, value)

    item.updated_at = datetime.utcnow()
    await db.commit()
    await db.refresh(item)
    return item
|
||||
|
||||
|
||||
# ── Delete item ──
|
||||
|
||||
@router.delete("/items/{item_id}")
async def delete_item(
    item_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Delete an item, its DB rows, and its stored asset files.

    NOTE(review): stored files are removed before the DB commit — if the
    commit then fails, the rows remain but the blobs are gone. Confirm
    this best-effort ordering is acceptable.
    """
    result = await db.execute(
        select(Item).where(Item.id == item_id, Item.user_id == user_id)
    )
    item = result.scalar_one_or_none()
    if not item:
        raise HTTPException(status_code=404, detail="Item not found")

    # Delete stored asset files from the storage backend.
    for asset in (await db.execute(
        select(ItemAsset).where(ItemAsset.item_id == item_id)
    )).scalars().all():
        storage.delete(asset.storage_path)

    # Asset rows are removed via the FK ON DELETE CASCADE.
    await db.delete(item)
    await db.commit()
    return {"status": "deleted"}
|
||||
|
||||
|
||||
# ── Reprocess item ──
|
||||
|
||||
@router.post("/items/{item_id}/reprocess", response_model=ItemOut)
async def reprocess_item(
    item_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Reset an item to pending and re-enqueue it for background processing."""
    stmt = (
        select(Item)
        .options(selectinload(Item.assets))
        .where(Item.id == item_id, Item.user_id == user_id)
    )
    item = (await db.execute(stmt)).scalar_one_or_none()
    if item is None:
        raise HTTPException(status_code=404, detail="Item not found")

    # Clear any previous failure state before re-queueing.
    item.processing_status = "pending"
    item.processing_error = None
    item.updated_at = datetime.utcnow()
    await db.commit()

    enqueue_process_item(item.id)
    return item
|
||||
|
||||
|
||||
# ── Search (keyword via Meilisearch) ──
|
||||
|
||||
@router.post("/search", response_model=SearchResult)
async def search_items(
    body: SearchQuery,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Keyword search: Meilisearch provides ranked IDs, Postgres hydrates them.

    Ordering from Meilisearch relevance is preserved when re-assembling
    the ORM objects.
    """
    # Imported lazily to avoid a routes<->engine import cycle at startup.
    from app.search.engine import keyword_search
    item_ids, total = await keyword_search(
        user_id=user_id, q=body.q, folder=body.folder, tags=body.tags,
        item_type=body.type, limit=body.limit, offset=body.offset,
    )
    if not item_ids:
        return SearchResult(items=[], total=0, query=body.q)

    result = await db.execute(
        select(Item).options(selectinload(Item.assets))
        .where(Item.id.in_(item_ids))
    )
    items_map = {i.id: i for i in result.scalars().all()}
    # Keep Meilisearch's relevance order; drop IDs no longer in the DB.
    ordered = [items_map[id] for id in item_ids if id in items_map]
    return SearchResult(items=ordered, total=total, query=body.q)
|
||||
|
||||
|
||||
# ── Semantic search (pgvector) ──
|
||||
|
||||
@router.post("/search/semantic", response_model=SearchResult)
async def semantic_search(
    body: SemanticSearchQuery,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Semantic search over item embeddings via pgvector."""
    # Lazy import keeps startup free of a routes<->engine cycle.
    from app.search.engine import vector_search

    hits = await vector_search(
        db=db,
        user_id=user_id,
        q=body.q,
        folder=body.folder,
        item_type=body.type,
        limit=body.limit,
    )
    return SearchResult(items=hits, total=len(hits), query=body.q)
|
||||
|
||||
|
||||
# ── Hybrid search ──
|
||||
|
||||
@router.post("/search/hybrid", response_model=SearchResult)
async def hybrid_search(
    body: HybridSearchQuery,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db_session),
):
    """Hybrid search: keyword + semantic results fused in the engine layer."""
    from app.search.engine import hybrid_search as do_hybrid

    fused = await do_hybrid(
        db=db,
        user_id=user_id,
        q=body.q,
        folder=body.folder,
        tags=body.tags,
        item_type=body.type,
        limit=body.limit,
    )
    return SearchResult(items=fused, total=len(fused), query=body.q)
|
||||
55
services/brain/app/config.py
Normal file
55
services/brain/app/config.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Brain service configuration — all from environment variables."""
|
||||
|
||||
import os
|
||||
|
||||
# ── Database ──
|
||||
DATABASE_URL = os.environ.get(
|
||||
"DATABASE_URL",
|
||||
"postgresql+asyncpg://brain:brain@brain-db:5432/brain"
|
||||
)
|
||||
DATABASE_URL_SYNC = DATABASE_URL.replace("+asyncpg", "")
|
||||
|
||||
# ── Redis ──
|
||||
REDIS_URL = os.environ.get("REDIS_URL", "redis://brain-redis:6379/0")
|
||||
|
||||
# ── Meilisearch ──
|
||||
MEILI_URL = os.environ.get("MEILI_URL", "http://brain-meili:7700")
|
||||
MEILI_KEY = os.environ.get("MEILI_MASTER_KEY", "brain-meili-key")
|
||||
MEILI_INDEX = "items"
|
||||
|
||||
# ── Browserless ──
|
||||
BROWSERLESS_URL = os.environ.get("BROWSERLESS_URL", "http://brain-browserless:3000")
|
||||
|
||||
# ── OpenAI ──
|
||||
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
|
||||
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
|
||||
OPENAI_EMBED_MODEL = os.environ.get("OPENAI_EMBED_MODEL", "text-embedding-3-small")
|
||||
OPENAI_EMBED_DIM = int(os.environ.get("OPENAI_EMBED_DIM", "1536"))
|
||||
|
||||
# ── Storage ──
|
||||
STORAGE_BACKEND = os.environ.get("STORAGE_BACKEND", "local") # local | s3
|
||||
STORAGE_LOCAL_PATH = os.environ.get("STORAGE_LOCAL_PATH", "/app/storage")
|
||||
|
||||
# ── S3 (future) ──
|
||||
S3_BUCKET = os.environ.get("S3_BUCKET", "")
|
||||
S3_ENDPOINT = os.environ.get("S3_ENDPOINT", "")
|
||||
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "")
|
||||
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "")
|
||||
|
||||
# ── Service ──
|
||||
PORT = int(os.environ.get("PORT", "8200"))
|
||||
DEBUG = os.environ.get("DEBUG", "").lower() in ("1", "true")
|
||||
|
||||
# ── Classification rules ──
|
||||
FOLDERS = [
|
||||
"Home", "Family", "Work", "Travel", "Knowledge", "Faith", "Projects"
|
||||
]
|
||||
|
||||
TAGS = [
|
||||
"reference", "important", "legal", "financial", "insurance",
|
||||
"research", "idea", "guide", "tutorial", "setup", "how-to",
|
||||
"tools", "dev", "server", "selfhosted", "home-assistant",
|
||||
"shopping", "compare", "buy", "product",
|
||||
"family", "kids", "health", "travel", "faith",
|
||||
"video", "read-later", "books",
|
||||
]
|
||||
18
services/brain/app/database.py
Normal file
18
services/brain/app/database.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""Database session and engine setup."""
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
|
||||
from sqlalchemy.orm import DeclarativeBase
|
||||
|
||||
from app.config import DATABASE_URL
|
||||
|
||||
engine = create_async_engine(DATABASE_URL, echo=False, pool_size=10, max_overflow=5)
|
||||
async_session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
|
||||
pass
|
||||
|
||||
|
||||
async def get_db() -> AsyncSession:
|
||||
async with async_session() as session:
|
||||
yield session
|
||||
41
services/brain/app/main.py
Normal file
41
services/brain/app/main.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""Brain service — FastAPI entrypoint."""
|
||||
|
||||
import logging
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
from app.api.routes import router
|
||||
from app.config import DEBUG
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.DEBUG if DEBUG else logging.INFO,
|
||||
format="%(asctime)s %(levelname)s %(name)s: %(message)s",
|
||||
)
|
||||
|
||||
app = FastAPI(
|
||||
title="Second Brain",
|
||||
description="Save everything. AI classifies it. Search it later.",
|
||||
version="1.0.0",
|
||||
docs_url="/api/docs" if DEBUG else None,
|
||||
redoc_url=None,
|
||||
)
|
||||
|
||||
# No CORS — internal service only, accessed via gateway
|
||||
app.include_router(router)
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup():
|
||||
from app.database import engine, Base
|
||||
from app.models.item import Item, ItemAsset, AppLink # noqa: import to register models
|
||||
|
||||
# Create tables if they don't exist
|
||||
async with engine.begin() as conn:
|
||||
await conn.run_sync(Base.metadata.create_all)
|
||||
|
||||
# Ensure Meilisearch index exists
|
||||
from app.search.engine import ensure_meili_index
|
||||
await ensure_meili_index()
|
||||
|
||||
logging.getLogger(__name__).info("Brain service started")
|
||||
0
services/brain/app/models/__init__.py
Normal file
0
services/brain/app/models/__init__.py
Normal file
80
services/brain/app/models/item.py
Normal file
80
services/brain/app/models/item.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""SQLAlchemy models for the brain service."""
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
from pgvector.sqlalchemy import Vector
|
||||
from sqlalchemy import (
|
||||
Column, String, Text, Integer, Float, DateTime, ForeignKey, Index, text
|
||||
)
|
||||
from sqlalchemy.dialects.postgresql import JSONB, UUID, ARRAY
|
||||
from sqlalchemy.orm import relationship
|
||||
|
||||
from app.config import OPENAI_EMBED_DIM
|
||||
from app.database import Base
|
||||
|
||||
|
||||
def new_id():
|
||||
return str(uuid.uuid4())
|
||||
|
||||
|
||||
class Item(Base):
|
||||
__tablename__ = "items"
|
||||
|
||||
id = Column(UUID(as_uuid=False), primary_key=True, default=new_id)
|
||||
user_id = Column(String(64), nullable=False, index=True)
|
||||
type = Column(String(32), nullable=False, default="link") # link|note|pdf|image|document|file
|
||||
title = Column(Text, nullable=True)
|
||||
url = Column(Text, nullable=True)
|
||||
raw_content = Column(Text, nullable=True) # original user input (note body, etc.)
|
||||
extracted_text = Column(Text, nullable=True) # full extracted text from page/doc
|
||||
folder = Column(String(64), nullable=True)
|
||||
tags = Column(ARRAY(String), nullable=True, default=list)
|
||||
summary = Column(Text, nullable=True)
|
||||
confidence = Column(Float, nullable=True)
|
||||
metadata_json = Column(JSONB, nullable=True, default=dict)
|
||||
processing_status = Column(String(32), nullable=False, default="pending") # pending|processing|ready|failed
|
||||
processing_error = Column(Text, nullable=True)
|
||||
|
||||
# Embedding (pgvector)
|
||||
embedding = Column(Vector(OPENAI_EMBED_DIM), nullable=True)
|
||||
|
||||
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
||||
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
|
||||
|
||||
# Relationships
|
||||
assets = relationship("ItemAsset", back_populates="item", cascade="all, delete-orphan")
|
||||
|
||||
__table_args__ = (
|
||||
Index("ix_items_user_status", "user_id", "processing_status"),
|
||||
Index("ix_items_user_folder", "user_id", "folder"),
|
||||
Index("ix_items_created", "created_at"),
|
||||
)
|
||||
|
||||
|
||||
class ItemAsset(Base):
|
||||
__tablename__ = "item_assets"
|
||||
|
||||
id = Column(UUID(as_uuid=False), primary_key=True, default=new_id)
|
||||
item_id = Column(UUID(as_uuid=False), ForeignKey("items.id", ondelete="CASCADE"), nullable=False, index=True)
|
||||
asset_type = Column(String(32), nullable=False) # screenshot|archived_html|original_upload|extracted_file
|
||||
filename = Column(String(512), nullable=False)
|
||||
content_type = Column(String(128), nullable=True)
|
||||
size_bytes = Column(Integer, nullable=True)
|
||||
storage_path = Column(String(1024), nullable=False) # relative path in storage
|
||||
|
||||
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
||||
|
||||
# Relationships
|
||||
item = relationship("Item", back_populates="assets")
|
||||
|
||||
|
||||
class AppLink(Base):
|
||||
"""Placeholder for future cross-app linking (e.g. link a saved item to a trip or task)."""
|
||||
__tablename__ = "app_links"
|
||||
|
||||
id = Column(UUID(as_uuid=False), primary_key=True, default=new_id)
|
||||
item_id = Column(UUID(as_uuid=False), ForeignKey("items.id", ondelete="CASCADE"), nullable=False, index=True)
|
||||
app = Column(String(64), nullable=False) # trips|tasks|fitness|inventory
|
||||
app_entity_id = Column(String(128), nullable=False)
|
||||
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
||||
109
services/brain/app/models/schema.py
Normal file
109
services/brain/app/models/schema.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""Pydantic schemas for API request/response."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
# ── Request schemas ──
|
||||
|
||||
class ItemCreate(BaseModel):
|
||||
type: str = "link"
|
||||
url: Optional[str] = None
|
||||
raw_content: Optional[str] = None
|
||||
title: Optional[str] = None
|
||||
folder: Optional[str] = None
|
||||
tags: Optional[list[str]] = None
|
||||
|
||||
|
||||
class ItemUpdate(BaseModel):
|
||||
title: Optional[str] = None
|
||||
folder: Optional[str] = None
|
||||
tags: Optional[list[str]] = None
|
||||
raw_content: Optional[str] = None
|
||||
|
||||
|
||||
class SearchQuery(BaseModel):
|
||||
q: str
|
||||
folder: Optional[str] = None
|
||||
tags: Optional[list[str]] = None
|
||||
type: Optional[str] = None
|
||||
limit: int = Field(default=20, le=100)
|
||||
offset: int = 0
|
||||
|
||||
|
||||
class SemanticSearchQuery(BaseModel):
|
||||
q: str
|
||||
folder: Optional[str] = None
|
||||
type: Optional[str] = None
|
||||
limit: int = Field(default=20, le=100)
|
||||
|
||||
|
||||
class HybridSearchQuery(BaseModel):
|
||||
q: str
|
||||
folder: Optional[str] = None
|
||||
tags: Optional[list[str]] = None
|
||||
type: Optional[str] = None
|
||||
limit: int = Field(default=20, le=100)
|
||||
|
||||
|
||||
# ── Response schemas ──
|
||||
|
||||
class AssetOut(BaseModel):
|
||||
id: str
|
||||
asset_type: str
|
||||
filename: str
|
||||
content_type: Optional[str] = None
|
||||
size_bytes: Optional[int] = None
|
||||
created_at: datetime
|
||||
|
||||
model_config = {"from_attributes": True}
|
||||
|
||||
|
||||
class ItemOut(BaseModel):
|
||||
id: str
|
||||
type: str
|
||||
title: Optional[str] = None
|
||||
url: Optional[str] = None
|
||||
folder: Optional[str] = None
|
||||
tags: Optional[list[str]] = None
|
||||
summary: Optional[str] = None
|
||||
confidence: Optional[float] = None
|
||||
processing_status: str
|
||||
processing_error: Optional[str] = None
|
||||
metadata_json: Optional[dict] = None
|
||||
created_at: datetime
|
||||
updated_at: datetime
|
||||
assets: list[AssetOut] = []
|
||||
|
||||
model_config = {"from_attributes": True}
|
||||
|
||||
|
||||
class ItemList(BaseModel):
|
||||
items: list[ItemOut]
|
||||
total: int
|
||||
|
||||
|
||||
class SearchResult(BaseModel):
|
||||
items: list[ItemOut]
|
||||
total: int
|
||||
query: str
|
||||
|
||||
|
||||
class ConfigOut(BaseModel):
|
||||
folders: list[str]
|
||||
tags: list[str]
|
||||
|
||||
|
||||
# ── OpenAI classification schema ──
|
||||
|
||||
class ClassificationResult(BaseModel):
|
||||
"""What the AI returns for each item."""
|
||||
folder: str
|
||||
tags: list[str] = Field(min_length=2, max_length=3)
|
||||
title: str
|
||||
summary: str
|
||||
confidence: float = Field(ge=0.0, le=1.0)
|
||||
0
services/brain/app/search/__init__.py
Normal file
0
services/brain/app/search/__init__.py
Normal file
183
services/brain/app/search/engine.py
Normal file
183
services/brain/app/search/engine.py
Normal file
@@ -0,0 +1,183 @@
|
||||
"""Search engine — Meilisearch for keywords, pgvector for semantic, hybrid merges both."""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import select, text
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.config import MEILI_URL, MEILI_KEY, MEILI_INDEX, OPENAI_EMBED_DIM
|
||||
from app.models.item import Item
|
||||
from app.services.embed import generate_embedding
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ── Meilisearch helpers ──
|
||||
|
||||
async def _meili_request(method: str, path: str, json_data: dict | None = None) -> dict | None:
    """Send one request to Meilisearch.

    Returns parsed JSON on success, {} for an empty 2xx body, and None on
    any failure (non-2xx or transport error) — callers treat search as
    best-effort and degrade gracefully.

    Fixed: the old annotation `json_data: dict = None` claimed a non-optional
    dict while defaulting to None; logging now uses lazy %-args.
    """
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.request(
                method,
                f"{MEILI_URL}/{path}",
                json=json_data,
                headers={"Authorization": f"Bearer {MEILI_KEY}"},
            )
        if resp.status_code < 300:
            return resp.json() if resp.content else {}
        log.warning("Meilisearch %s %s: %s", method, path, resp.status_code)
    except Exception as e:
        log.error("Meilisearch error: %s", e)
    return None
|
||||
|
||||
|
||||
async def ensure_meili_index():
    """Idempotently create the items index and apply its search settings.

    Meilisearch returns an error for an existing index, which
    _meili_request swallows — so calling this on every boot is safe.
    """
    await _meili_request("POST", "indexes", {"uid": MEILI_INDEX, "primaryKey": "id"})

    # Settings are re-applied on each startup (PUT is idempotent).
    index_settings = {
        "filterableAttributes": ["user_id", "folder", "tags", "type", "processing_status"],
        "searchableAttributes": ["title", "extracted_text", "summary", "url"],
        "sortableAttributes": ["created_at"],
    }
    await _meili_request("PUT", f"indexes/{MEILI_INDEX}/settings", index_settings)
|
||||
|
||||
|
||||
async def index_item(item_data: dict):
    """Upsert a single document into the Meilisearch items index."""
    documents = [item_data]
    await _meili_request("POST", f"indexes/{MEILI_INDEX}/documents", documents)
|
||||
|
||||
|
||||
async def remove_from_index(item_id: str):
    """Delete one document from the Meilisearch items index by ID."""
    doc_path = f"indexes/{MEILI_INDEX}/documents/{item_id}"
    await _meili_request("DELETE", doc_path)
|
||||
|
||||
|
||||
# ── Keyword search (Meilisearch) ──
|
||||
|
||||
async def keyword_search(
|
||||
user_id: str,
|
||||
q: str,
|
||||
folder: str | None = None,
|
||||
tags: list[str] | None = None,
|
||||
item_type: str | None = None,
|
||||
limit: int = 20,
|
||||
offset: int = 0,
|
||||
) -> tuple[list[str], int]:
|
||||
"""Search Meilisearch. Returns (item_ids, total)."""
|
||||
filters = [f'user_id = "{user_id}"', 'processing_status = "ready"']
|
||||
if folder:
|
||||
filters.append(f'folder = "{folder}"')
|
||||
if item_type:
|
||||
filters.append(f'type = "{item_type}"')
|
||||
if tags:
|
||||
for tag in tags:
|
||||
filters.append(f'tags = "{tag}"')
|
||||
|
||||
result = await _meili_request("POST", f"indexes/{MEILI_INDEX}/search", {
|
||||
"q": q,
|
||||
"filter": " AND ".join(filters),
|
||||
"limit": limit,
|
||||
"offset": offset,
|
||||
})
|
||||
|
||||
if not result:
|
||||
return [], 0
|
||||
|
||||
ids = [hit["id"] for hit in result.get("hits", [])]
|
||||
total = result.get("estimatedTotalHits", len(ids))
|
||||
return ids, total
|
||||
|
||||
|
||||
# ── Semantic search (pgvector) ──
|
||||
|
||||
async def vector_search(
    db: AsyncSession,
    user_id: str,
    q: str,
    folder: str | None = None,
    item_type: str | None = None,
    limit: int = 20,
) -> list:
    """Semantic similarity search using pgvector cosine distance.

    Embeds the query, ranks the user's ready items by `<=>` (cosine
    distance), then hydrates full ORM objects preserving that order.
    Returns [] when embedding fails or nothing matches.

    Fixed: the query vector is now passed as a bound parameter
    (CAST(:query_vec AS vector)) instead of being f-string-spliced into
    the SQL text.
    """
    query_embedding = await generate_embedding(q)
    if not query_embedding:
        return []

    params = {
        "user_id": user_id,
        "limit": limit,
        # pgvector accepts its text representation; the CAST below parses it.
        "query_vec": "[" + ",".join(str(x) for x in query_embedding) + "]",
    }

    filters = ["i.user_id = :user_id", "i.processing_status = 'ready'", "i.embedding IS NOT NULL"]
    if folder:
        filters.append("i.folder = :folder")
        params["folder"] = folder
    if item_type:
        filters.append("i.type = :item_type")
        params["item_type"] = item_type

    where = " AND ".join(filters)

    sql = text(f"""
        SELECT i.id, i.embedding <=> CAST(:query_vec AS vector) AS distance
        FROM items i
        WHERE {where}
        ORDER BY distance ASC
        LIMIT :limit
    """)

    result = await db.execute(sql, params)
    item_ids = [row[0] for row in result.fetchall()]
    if not item_ids:
        return []

    # Hydrate ORM objects, then restore the distance ordering from the raw query.
    items_result = await db.execute(
        select(Item).options(selectinload(Item.assets))
        .where(Item.id.in_(item_ids))
    )
    items_map = {i.id: i for i in items_result.scalars().all()}
    return [items_map[id] for id in item_ids if id in items_map]
|
||||
|
||||
|
||||
# ── Hybrid search ──
|
||||
|
||||
async def hybrid_search(
    db: AsyncSession,
    user_id: str,
    q: str,
    folder: str | None = None,
    tags: list[str] | None = None,
    item_type: str | None = None,
    limit: int = 20,
) -> list:
    """Merge keyword + semantic results using reciprocal rank fusion.

    Each ranking contributes 1/(k + rank) per item; items appearing in
    both rankings accumulate both contributions and rise to the top.
    """
    # Over-fetch both rankings so fusion has enough candidates to merge.
    kw_ids, _ = await keyword_search(user_id, q, folder, tags, item_type, limit=limit * 2)
    sem_items = await vector_search(db, user_id, q, folder, item_type, limit=limit * 2)
    sem_ids = [i.id for i in sem_items]

    # Reciprocal rank fusion over both rankings.
    RRF_K = 60
    scores: dict[str, float] = {}
    for ranking in (kw_ids, sem_ids):
        for rank, item_id in enumerate(ranking):
            scores[item_id] = scores.get(item_id, 0) + 1.0 / (RRF_K + rank)

    # Best combined score first, truncated to the requested page size.
    merged_ids = sorted(scores.keys(), key=lambda x: scores[x], reverse=True)[:limit]
    if not merged_ids:
        return []

    hydrated = await db.execute(
        select(Item).options(selectinload(Item.assets))
        .where(Item.id.in_(merged_ids))
    )
    by_id = {i.id: i for i in hydrated.scalars().all()}
    return [by_id[id] for id in merged_ids if id in by_id]
|
||||
0
services/brain/app/services/__init__.py
Normal file
0
services/brain/app/services/__init__.py
Normal file
125
services/brain/app/services/classify.py
Normal file
125
services/brain/app/services/classify.py
Normal file
@@ -0,0 +1,125 @@
|
||||
"""OpenAI classification — structured output for folder/tags/title/summary."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
import httpx
|
||||
|
||||
from app.config import OPENAI_API_KEY, OPENAI_MODEL, FOLDERS, TAGS
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
SYSTEM_PROMPT = f"""You are a classification engine for a personal "second brain" knowledge management system.
|
||||
|
||||
Given an item (URL, note, document, or file), you must return structured JSON with:
|
||||
- folder: exactly 1 from this list: {json.dumps(FOLDERS)}
|
||||
- tags: exactly 2 or 3 from this list: {json.dumps(TAGS)}
|
||||
- title: a concise, normalized title (max 80 chars)
|
||||
- summary: a 1-2 sentence summary of the content
|
||||
- confidence: a float 0.0-1.0 indicating how confident you are
|
||||
|
||||
Rules:
|
||||
- NEVER invent folders or tags not in the lists above
|
||||
- NEVER skip classification
|
||||
- NEVER return freeform text outside the schema
|
||||
- Always return valid JSON matching the schema exactly"""
|
||||
|
||||
RESPONSE_SCHEMA = {
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "classification",
|
||||
"strict": True,
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"folder": {"type": "string", "enum": FOLDERS},
|
||||
"tags": {
|
||||
"type": "array",
|
||||
"items": {"type": "string", "enum": TAGS},
|
||||
"minItems": 2,
|
||||
"maxItems": 3,
|
||||
},
|
||||
"title": {"type": "string"},
|
||||
"summary": {"type": "string"},
|
||||
"confidence": {"type": "number"},
|
||||
},
|
||||
"required": ["folder", "tags", "title", "summary", "confidence"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def build_user_prompt(item_type: str, url: str | None, title: str | None, text: str | None) -> str:
|
||||
parts = [f"Item type: {item_type}"]
|
||||
if url:
|
||||
parts.append(f"URL: {url}")
|
||||
if title:
|
||||
parts.append(f"Original title: {title}")
|
||||
if text:
|
||||
# Truncate to ~4000 chars for context window efficiency
|
||||
truncated = text[:4000]
|
||||
parts.append(f"Content:\n{truncated}")
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
async def classify_item(
    item_type: str,
    url: str | None = None,
    title: str | None = None,
    text: str | None = None,
    retries: int = 2,
) -> dict:
    """Call OpenAI to classify an item.

    Returns a dict with folder, tags, title, summary, confidence. Falls back
    to a neutral "Knowledge" classification when no API key is configured or
    when every attempt fails.

    Args:
        item_type: "link", "note", "document", or "file".
        url/title/text: optional item fields forwarded to the prompt.
        retries: number of additional attempts after the first failure.
    """
    fallback = {
        "folder": "Knowledge",
        "tags": ["reference", "read-later"],
        "title": title or "Untitled",
        "summary": "No AI classification available",
        "confidence": 0.0,
    }
    if not OPENAI_API_KEY:
        log.warning("No OPENAI_API_KEY set, returning defaults")
        return fallback

    user_msg = build_user_prompt(item_type, url, title, text)

    for attempt in range(retries + 1):
        try:
            async with httpx.AsyncClient(timeout=30) as client:
                resp = await client.post(
                    "https://api.openai.com/v1/chat/completions",
                    headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
                    json={
                        "model": OPENAI_MODEL,
                        "messages": [
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {"role": "user", "content": user_msg},
                        ],
                        "response_format": RESPONSE_SCHEMA,
                        "temperature": 0.2,
                    },
                )
            resp.raise_for_status()
            data = resp.json()
            result = json.loads(data["choices"][0]["message"]["content"])

            # Defensive validation: the strict schema should guarantee these,
            # but never trust model output blindly.
            if result.get("folder") not in FOLDERS:
                result["folder"] = "Knowledge"
            tags = [t for t in result.get("tags", []) if t in TAGS]
            # Bug fix: the old padding appended defaults unconditionally and
            # could duplicate a tag already present (e.g. ["reference"] ->
            # ["reference", "reference", "read-later"]). Pad only with tags
            # not already chosen, until we have at least 2.
            for default_tag in ("reference", "read-later"):
                if len(tags) >= 2:
                    break
                if default_tag not in tags:
                    tags.append(default_tag)
            result["tags"] = tags[:3]

            return result

        except Exception as e:
            log.error(f"Classification attempt {attempt + 1} failed: {e}")
            if attempt == retries:
                return {**fallback, "summary": f"Classification failed: {e}"}

    return fallback  # unreachable with retries >= 0; keeps the return type total
|
||||
36
services/brain/app/services/embed.py
Normal file
36
services/brain/app/services/embed.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""Embedding generation via OpenAI text-embedding API."""
|
||||
|
||||
import logging
|
||||
|
||||
import httpx
|
||||
|
||||
from app.config import OPENAI_API_KEY, OPENAI_EMBED_MODEL, OPENAI_EMBED_DIM
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def generate_embedding(text: str) -> list[float] | None:
    """Generate a vector embedding for the given text. Returns list of floats or None on failure."""
    # Bail out early when there is no API key or nothing meaningful to embed.
    if not OPENAI_API_KEY or not text.strip():
        return None

    payload = {
        "model": OPENAI_EMBED_MODEL,
        # Truncate to ~8000 chars to stay under the embedding model's token limit.
        "input": text[:8000],
        "dimensions": OPENAI_EMBED_DIM,
    }
    try:
        async with httpx.AsyncClient(timeout=20) as client:
            resp = await client.post(
                "https://api.openai.com/v1/embeddings",
                headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
                json=payload,
            )
        resp.raise_for_status()
        return resp.json()["data"][0]["embedding"]
    except Exception as e:
        log.error(f"Embedding generation failed: {e}")
        return None
|
||||
164
services/brain/app/services/ingest.py
Normal file
164
services/brain/app/services/ingest.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""Content ingestion — fetch, extract, screenshot, archive."""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
from html.parser import HTMLParser
|
||||
from io import StringIO
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
from app.config import BROWSERLESS_URL
|
||||
from app.services.storage import storage
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class _HTMLTextExtractor(HTMLParser):
|
||||
"""Simple HTML to text converter."""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._result = StringIO()
|
||||
self._skip = False
|
||||
self._skip_tags = {"script", "style", "noscript", "svg"}
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag in self._skip_tags:
|
||||
self._skip = True
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag in self._skip_tags:
|
||||
self._skip = False
|
||||
if tag in ("p", "div", "br", "h1", "h2", "h3", "h4", "li", "tr"):
|
||||
self._result.write("\n")
|
||||
|
||||
def handle_data(self, data):
|
||||
if not self._skip:
|
||||
self._result.write(data)
|
||||
|
||||
def get_text(self) -> str:
|
||||
raw = self._result.getvalue()
|
||||
# Collapse whitespace
|
||||
lines = [line.strip() for line in raw.splitlines()]
|
||||
return "\n".join(line for line in lines if line)
|
||||
|
||||
|
||||
def html_to_text(html: str) -> str:
    """Convert an HTML document to plain text via _HTMLTextExtractor."""
    parser = _HTMLTextExtractor()
    parser.feed(html)
    return parser.get_text()
|
||||
|
||||
|
||||
def extract_title_from_html(html: str) -> str | None:
|
||||
match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
|
||||
return match.group(1).strip() if match else None
|
||||
|
||||
|
||||
def extract_meta_description(html: str) -> str | None:
|
||||
match = re.search(
|
||||
r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
|
||||
html, re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
return match.group(1).strip() if match else None
|
||||
|
||||
|
||||
async def fetch_url_content(url: str) -> dict:
    """Fetch URL content. Returns dict with html, text, title, description, used_browserless."""
    result = {
        "html": None,
        "text": None,
        "title": None,
        "description": None,
        "used_browserless": False,
    }

    # HTTP-first strategy: a plain GET is cheap; browserless is the fallback.
    try:
        async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
            response = await client.get(
                url,
                headers={"User-Agent": "Mozilla/5.0 (compatible; SecondBrain/1.0)"},
            )
            response.raise_for_status()
            page = response.text
            result["html"] = page
            result["text"] = html_to_text(page)
            result["title"] = extract_title_from_html(page)
            result["description"] = extract_meta_description(page)

            # Thin extraction (< 200 chars) usually means a JS-rendered page:
            # retry through browserless and keep whichever yields more text.
            plain = result["text"] or ""
            if len(plain) < 200:
                log.info(f"Weak extraction ({len(plain)} chars), trying browserless")
                rendered = await fetch_with_browserless(url)
                if rendered and len(rendered.get("text", "")) > len(plain):
                    result.update(rendered)
                    result["used_browserless"] = True

    except Exception as e:
        # The plain fetch failed outright — browserless is the last resort.
        log.warning(f"HTTP fetch failed for {url}: {e}, trying browserless")
        try:
            rendered = await fetch_with_browserless(url)
            if rendered:
                result.update(rendered)
                result["used_browserless"] = True
        except Exception as e2:
            log.error(f"Browserless also failed for {url}: {e2}")

    return result
|
||||
|
||||
|
||||
async def fetch_with_browserless(url: str) -> dict | None:
    """Use browserless/chrome to render JS-heavy pages."""
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(
                f"{BROWSERLESS_URL}/content",
                json={"url": url, "waitForTimeout": 3000},
            )
        if resp.status_code != 200:
            return None
        html = resp.text
        # Run the same extraction pipeline over the rendered markup.
        return {
            "html": html,
            "text": html_to_text(html),
            "title": extract_title_from_html(html),
            "description": extract_meta_description(html),
        }
    except Exception as e:
        log.error(f"Browserless fetch failed: {e}")
        return None
|
||||
|
||||
|
||||
async def take_screenshot(url: str, item_id: str) -> str | None:
    """Take a screenshot of a URL using browserless. Returns storage path or None."""
    payload = {
        "url": url,
        "options": {"type": "png", "fullPage": False},
        "waitForTimeout": 3000,
    }
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(f"{BROWSERLESS_URL}/screenshot", json=payload)
        if resp.status_code != 200:
            return None
        # Persist the PNG bytes and hand back the relative storage path.
        return storage.save(
            item_id=item_id,
            asset_type="screenshot",
            filename="screenshot.png",
            data=resp.content,
        )
    except Exception as e:
        log.error(f"Screenshot failed for {url}: {e}")
        return None
|
||||
|
||||
|
||||
async def archive_html(html: str, item_id: str) -> str | None:
    """Save the full HTML as an archived asset."""
    if not html:
        return None
    try:
        # Store the page source so the item survives link rot.
        return storage.save(
            item_id=item_id,
            asset_type="archived_html",
            filename="page.html",
            data=html.encode("utf-8"),
        )
    except Exception as e:
        log.error(f"HTML archive failed: {e}")
        return None
|
||||
81
services/brain/app/services/storage.py
Normal file
81
services/brain/app/services/storage.py
Normal file
@@ -0,0 +1,81 @@
|
||||
"""File storage abstraction — local disk first, S3-ready interface."""
|
||||
|
||||
import os
|
||||
import shutil
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
|
||||
from app.config import STORAGE_BACKEND, STORAGE_LOCAL_PATH
|
||||
|
||||
|
||||
class StorageBackend(ABC):
    """Abstract asset-storage interface.

    Paths are backend-relative strings (e.g. "item_id/asset_type/filename");
    implementations decide how they map to real locations. A future
    S3Storage can implement this same interface.
    """

    @abstractmethod
    def save(self, item_id: str, asset_type: str, filename: str, data: bytes) -> str:
        """Save file, return relative storage path."""
        ...

    @abstractmethod
    def read(self, path: str) -> bytes:
        """Return the raw bytes stored at *path*."""
        ...

    @abstractmethod
    def delete(self, path: str) -> None:
        """Remove the file at *path* (no-op semantics are backend-defined)."""
        ...

    @abstractmethod
    def exists(self, path: str) -> bool:
        """Return True if *path* currently holds a stored file."""
        ...

    @abstractmethod
    def url(self, path: str) -> str:
        """Return a URL or local path for serving."""
        ...
|
||||
|
||||
|
||||
class LocalStorage(StorageBackend):
    """Store assets on the local filesystem under a single base directory.

    Layout: ``<base>/<item_id>/<asset_type>/<filename>``.
    """

    def __init__(self, base_path: str):
        self.base = Path(base_path)
        self.base.mkdir(parents=True, exist_ok=True)

    def _full_path(self, path: str) -> Path:
        """Resolve a backend-relative path against the base directory."""
        return self.base / path

    def save(self, item_id: str, asset_type: str, filename: str, data: bytes) -> str:
        """Save file, return relative storage path.

        Bug fix: the relative path previously hard-coded a literal
        "(unknown)" segment instead of interpolating *filename* (which was
        otherwise unused), so every asset of the same type collided on one
        extension-less path.
        """
        rel = f"{item_id}/{asset_type}/{filename}"
        full = self._full_path(rel)
        full.parent.mkdir(parents=True, exist_ok=True)
        full.write_bytes(data)
        return rel

    def read(self, path: str) -> bytes:
        """Return the raw bytes stored at *path*."""
        return self._full_path(path).read_bytes()

    def delete(self, path: str) -> None:
        """Delete the file at *path* and prune now-empty parent directories."""
        full = self._full_path(path)
        if full.exists():
            full.unlink()
        # Walk upward removing empty dirs; stop at the base or the first
        # non-empty directory (rmdir raises OSError on non-empty).
        parent = full.parent
        while parent != self.base:
            try:
                parent.rmdir()
                parent = parent.parent
            except OSError:
                break

    def exists(self, path: str) -> bool:
        """Return True if *path* currently holds a stored file."""
        return self._full_path(path).exists()

    def url(self, path: str) -> str:
        """Return the path under the app's /storage mount for serving."""
        return f"/storage/{path}"
|
||||
|
||||
|
||||
# Future: S3Storage class implementing the same interface
|
||||
|
||||
def _create_storage() -> StorageBackend:
    """Factory: build the backend selected by STORAGE_BACKEND config."""
    if STORAGE_BACKEND != "local":
        raise ValueError(f"Unknown storage backend: {STORAGE_BACKEND}")
    return LocalStorage(STORAGE_LOCAL_PATH)


# Module-level singleton used throughout the service.
# Future: S3Storage class implementing the same interface.
storage = _create_storage()
|
||||
0
services/brain/app/worker/__init__.py
Normal file
0
services/brain/app/worker/__init__.py
Normal file
156
services/brain/app/worker/tasks.py
Normal file
156
services/brain/app/worker/tasks.py
Normal file
@@ -0,0 +1,156 @@
|
||||
"""Background worker tasks — processes items after creation."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
from redis import Redis
|
||||
from rq import Queue
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.config import REDIS_URL, DATABASE_URL_SYNC
|
||||
from app.models.item import Item, ItemAsset
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# RQ queue — module-level Redis connection and the "brain" work queue,
# shared by the API process (producer) and the RQ worker (consumer).
_redis = Redis.from_url(REDIS_URL)
queue = Queue("brain", connection=_redis)


def enqueue_process_item(item_id: str):
    """Enqueue a background job to process an item.

    job_timeout=300 caps a single item's pipeline (fetch + classify +
    embed + index) at 5 minutes before RQ kills it.
    """
    queue.enqueue(process_item_job, item_id, job_timeout=300)
|
||||
|
||||
|
||||
def process_item_job(item_id: str):
    """Synchronous entry point for RQ — runs the async pipeline.

    RQ workers invoke plain callables, so asyncio.run spins up a fresh
    event loop per job and tears it down when the pipeline finishes.
    """
    asyncio.run(_process_item(item_id))
|
||||
|
||||
|
||||
async def _process_item(item_id: str):
    """Full processing pipeline for a saved item.

    Steps: fetch/extract content (URL items), AI classification, embedding,
    status update, Meilisearch indexing. On any failure the item is marked
    "failed" with a truncated error message.

    Fixes over the original:
    - ``item.metadata_json`` is reassigned as a fresh dict instead of
      mutated in place (in-place mutation of a JSON column is not tracked
      by SQLAlchemy without MutableDict, so the update could be lost);
    - the failure path rolls back before writing the "failed" status (a
      commit-time failure leaves the session in a state where a further
      commit raises);
    - ``engine.dispose()`` runs in a ``finally`` so the per-job engine is
      released even when an exception escapes the session block.
    """
    # Imported lazily: the worker process creates its own engine per job.
    from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
    from app.config import DATABASE_URL
    from app.services.ingest import fetch_url_content, take_screenshot, archive_html
    from app.services.classify import classify_item
    from app.services.embed import generate_embedding
    from app.search.engine import index_item, ensure_meili_index

    engine = create_async_engine(DATABASE_URL, echo=False)
    Session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

    try:
        async with Session() as db:
            # Load the item with its assets eagerly loaded.
            result = await db.execute(
                select(Item).options(selectinload(Item.assets)).where(Item.id == item_id)
            )
            item = result.scalar_one_or_none()
            if not item:
                log.error(f"Item {item_id} not found")
                return

            try:
                item.processing_status = "processing"
                await db.commit()

                extracted_text = item.raw_content or ""
                title = item.title
                html_content = None

                # ── Step 1: Fetch content for URLs ──
                if item.type == "link" and item.url:
                    log.info(f"Fetching URL: {item.url}")
                    content = await fetch_url_content(item.url)
                    html_content = content.get("html")
                    extracted_text = content.get("text") or extracted_text
                    if not title:
                        title = content.get("title")
                    # Reassign a fresh dict so SQLAlchemy change-tracking
                    # registers the update.
                    meta = dict(item.metadata_json or {})
                    meta["description"] = content.get("description")
                    meta["used_browserless"] = content.get("used_browserless", False)
                    item.metadata_json = meta

                    # Take screenshot
                    screenshot_path = await take_screenshot(item.url, item.id)
                    if screenshot_path:
                        db.add(ItemAsset(
                            id=str(uuid.uuid4()),
                            item_id=item.id,
                            asset_type="screenshot",
                            filename="screenshot.png",
                            content_type="image/png",
                            storage_path=screenshot_path,
                        ))

                    # Archive HTML
                    if html_content:
                        html_path = await archive_html(html_content, item.id)
                        if html_path:
                            db.add(ItemAsset(
                                id=str(uuid.uuid4()),
                                item_id=item.id,
                                asset_type="archived_html",
                                filename="page.html",
                                content_type="text/html",
                                storage_path=html_path,
                            ))

                # ── Step 2: AI classification ──
                log.info(f"Classifying item {item.id}")
                classification = await classify_item(
                    item_type=item.type,
                    url=item.url,
                    title=title,
                    text=extracted_text,
                )

                item.title = classification.get("title") or title or "Untitled"
                item.folder = classification.get("folder", "Knowledge")
                item.tags = classification.get("tags", ["reference", "read-later"])
                item.summary = classification.get("summary")
                item.confidence = classification.get("confidence", 0.0)
                item.extracted_text = extracted_text

                # ── Step 3: Generate embedding ──
                log.info(f"Generating embedding for item {item.id}")
                embed_text = f"{item.title or ''}\n{item.summary or ''}\n{extracted_text}"
                embedding = await generate_embedding(embed_text)
                if embedding:
                    item.embedding = embedding

                # ── Step 4: Update status ──
                item.processing_status = "ready"
                item.updated_at = datetime.utcnow()
                await db.commit()

                # ── Step 5: Index in Meilisearch ──
                log.info(f"Indexing item {item.id} in Meilisearch")
                await ensure_meili_index()
                await index_item({
                    "id": item.id,
                    "user_id": item.user_id,
                    "type": item.type,
                    "title": item.title,
                    "url": item.url,
                    "folder": item.folder,
                    "tags": item.tags or [],
                    "summary": item.summary,
                    "extracted_text": (extracted_text or "")[:10000],  # Truncate for search index
                    "processing_status": item.processing_status,
                    "created_at": item.created_at.isoformat() if item.created_at else None,
                })

                log.info(f"Item {item.id} processed successfully")

            except Exception as e:
                log.error(f"Processing failed for item {item.id}: {e}", exc_info=True)
                # Roll back first: if the failure happened mid-flush/commit,
                # the session must be reset before it can commit again.
                await db.rollback()
                item.processing_status = "failed"
                item.processing_error = str(e)[:500]
                item.updated_at = datetime.utcnow()
                await db.commit()
    finally:
        await engine.dispose()
|
||||
Reference in New Issue
Block a user