feat: major platform expansion — Brain service, RSS reader, iOS app, AI assistants, Firefox extension
All checks were successful
Security Checks / dependency-audit (push) Successful in 1m13s
Security Checks / secret-scanning (push) Successful in 3s
Security Checks / dockerfile-lint (push) Successful in 3s

Brain Service:
- Playwright stealth crawler replacing browserless (og:image, Readability, Reddit JSON API)
- AI classification with tag definitions and folder assignment
- YouTube video download via yt-dlp
- Karakeep migration complete (96 items)
- Taxonomy management (folders with icons/colors, tags)
- Discovery shuffle, sort options, search (Meilisearch + pgvector)
- Item tag/folder editing, card color accents

RSS Reader Service:
- Custom FastAPI reader replacing Miniflux
- Feed management (add/delete/refresh), category support
- Full article extraction via Readability
- Background content fetching for new entries
- Mark all read with confirmation
- Infinite scroll, retention cleanup (30/60 day)
- 17 feeds migrated from Miniflux

iOS App (SwiftUI):
- Native iOS 17+ app with @Observable architecture
- Cookie-based auth, configurable gateway URL
- Dashboard with custom background photo + frosted glass widgets
- Full fitness module (today/templates/goals/food library)
- AI assistant chat (fitness + brain, raw JSON state management)
- 120fps ProMotion support

AI Assistants (Gateway):
- Unified dispatcher with fitness/brain domain detection
- Fitness: natural language food logging, photo analysis, multi-item splitting
- Brain: save/append/update/delete notes, search & answer, undo support
- Madiha user gets fitness-only (brain disabled)

Firefox Extension:
- One-click save to Brain from any page
- Login with platform credentials
- Right-click context menu (save page/link/image)
- Notes field for URL saves
- Signed and published on AMO

Other:
- Reader bookmark button routes to Brain (was Karakeep)
- Fitness food library with "Add" button + add-to-meal popup
- Kindle send file size check (25MB SMTP2GO limit)
- Atelier UI as default (useAtelierShell=true)
- Mobile upload box in nav drawer

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Yusuf Suleman
2026-04-03 00:56:29 -05:00
parent af1765bd8e
commit 4592e35732
97 changed files with 11009 additions and 532 deletions

View File

@@ -2,8 +2,8 @@ FROM python:3.12-slim
WORKDIR /app
RUN apt-get update && apt-get install -y --no-install-recommends libpq-dev tesseract-ocr tesseract-ocr-eng && rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir --upgrade pip
RUN apt-get update && apt-get install -y --no-install-recommends libpq-dev tesseract-ocr tesseract-ocr-eng ffmpeg && rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir --upgrade pip yt-dlp
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

View File

@@ -13,10 +13,10 @@ from sqlalchemy.orm import selectinload
from app.api.deps import get_user_id, get_db_session
from app.config import FOLDERS, TAGS
from app.models.item import Item, ItemAsset
from app.models.item import Item, ItemAsset, ItemAddition
from app.models.schema import (
ItemCreate, ItemUpdate, ItemOut, ItemList, SearchQuery, SemanticSearchQuery,
HybridSearchQuery, SearchResult, ConfigOut,
HybridSearchQuery, SearchResult, ConfigOut, ItemAdditionCreate, ItemAdditionOut,
)
from app.services.storage import storage
from fastapi.responses import Response
@@ -25,6 +25,46 @@ from app.worker.tasks import enqueue_process_item
router = APIRouter(prefix="/api", tags=["brain"])
async def refresh_item_search_state(db: AsyncSession, item: Item):
    """Recompute embedding + Meilisearch doc after assistant additions change.

    Rebuilds the item's searchable text from its raw content, extracted text
    and all ItemAddition rows (oldest first), regenerates the vector
    embedding, commits the session, and re-indexes the item in Meilisearch.

    Args:
        db: Active async SQLAlchemy session; this function commits it.
        item: The Item whose search state should be refreshed.
    """
    # Imported lazily — presumably to avoid circular imports at module load
    # time (TODO confirm against app.search.engine / app.services.embed).
    from app.search.engine import index_item
    from app.services.embed import generate_embedding
    additions_result = await db.execute(
        select(ItemAddition)
        .where(ItemAddition.item_id == item.id, ItemAddition.user_id == item.user_id)
        .order_by(ItemAddition.created_at.asc())
    )
    additions = additions_result.scalars().all()
    # Only additions with non-whitespace content contribute to search text.
    additions_text = "\n\n".join(addition.content for addition in additions if addition.content.strip())
    searchable_text_parts = [item.raw_content or "", item.extracted_text or "", additions_text]
    searchable_text = "\n\n".join(part.strip() for part in searchable_text_parts if part and part.strip())
    embed_text = f"{item.title or ''}\n{item.summary or ''}\n{searchable_text}".strip()
    embedding = await generate_embedding(embed_text)
    if embedding:
        # Only overwrite the stored vector when generation succeeded.
        item.embedding = embedding
        item.updated_at = datetime.utcnow()
    await db.commit()
    await db.refresh(item)
    # Push the merged text (truncated to 10k chars) into Meilisearch so
    # keyword search covers assistant additions as well as original content.
    await index_item({
        "id": item.id,
        "user_id": item.user_id,
        "type": item.type,
        "title": item.title,
        "url": item.url,
        "folder": item.folder,
        "tags": item.tags or [],
        "summary": item.summary,
        "extracted_text": searchable_text[:10000],
        "processing_status": item.processing_status,
        "created_at": item.created_at.isoformat() if item.created_at else None,
    })
# ── Health ──
@router.get("/health")
@@ -201,14 +241,31 @@ async def update_item(
item.title = body.title
if body.folder is not None:
item.folder = body.folder
# Update folder_id FK
from app.models.taxonomy import Folder as FolderModel
folder_row = (await db.execute(
select(FolderModel).where(FolderModel.user_id == user_id, FolderModel.name == body.folder)
)).scalar_one_or_none()
item.folder_id = folder_row.id if folder_row else None
if body.tags is not None:
item.tags = body.tags
# Update item_tags relational entries
from app.models.taxonomy import Tag as TagModel, ItemTag
from sqlalchemy import delete as sa_delete
await db.execute(sa_delete(ItemTag).where(ItemTag.item_id == item.id))
for tag_name in body.tags:
tag_row = (await db.execute(
select(TagModel).where(TagModel.user_id == user_id, TagModel.name == tag_name)
)).scalar_one_or_none()
if tag_row:
db.add(ItemTag(item_id=item.id, tag_id=tag_row.id))
if body.raw_content is not None:
item.raw_content = body.raw_content
item.updated_at = datetime.utcnow()
await db.commit()
await db.refresh(item)
await refresh_item_search_state(db, item)
return item
@@ -238,6 +295,100 @@ async def delete_item(
return {"status": "deleted"}
@router.get("/items/{item_id}/additions", response_model=list[ItemAdditionOut])
async def list_item_additions(
item_id: str,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db_session),
):
item = (await db.execute(
select(Item).where(Item.id == item_id, Item.user_id == user_id)
)).scalar_one_or_none()
if not item:
raise HTTPException(status_code=404, detail="Item not found")
additions = (await db.execute(
select(ItemAddition)
.where(ItemAddition.item_id == item_id, ItemAddition.user_id == user_id)
.order_by(ItemAddition.created_at.asc())
)).scalars().all()
return additions
@router.post("/items/{item_id}/additions", response_model=ItemAdditionOut, status_code=201)
async def create_item_addition(
item_id: str,
body: ItemAdditionCreate,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db_session),
):
item = (await db.execute(
select(Item).where(Item.id == item_id, Item.user_id == user_id)
)).scalar_one_or_none()
if not item:
raise HTTPException(status_code=404, detail="Item not found")
content = body.content.strip()
if not content:
raise HTTPException(status_code=400, detail="Addition content cannot be empty")
addition = ItemAddition(
id=str(uuid.uuid4()),
item_id=item.id,
user_id=user_id,
source=(body.source or "assistant").strip() or "assistant",
kind=(body.kind or "append").strip() or "append",
content=content,
metadata_json=body.metadata_json or {},
)
db.add(addition)
item.updated_at = datetime.utcnow()
await db.commit()
await db.refresh(addition)
result = await db.execute(
select(Item).where(Item.id == item.id, Item.user_id == user_id)
)
fresh_item = result.scalar_one()
await refresh_item_search_state(db, fresh_item)
return addition
@router.delete("/items/{item_id}/additions/{addition_id}")
async def delete_item_addition(
item_id: str,
addition_id: str,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db_session),
):
item = (await db.execute(
select(Item).where(Item.id == item_id, Item.user_id == user_id)
)).scalar_one_or_none()
if not item:
raise HTTPException(status_code=404, detail="Item not found")
addition = (await db.execute(
select(ItemAddition).where(
ItemAddition.id == addition_id,
ItemAddition.item_id == item_id,
ItemAddition.user_id == user_id,
)
)).scalar_one_or_none()
if not addition:
raise HTTPException(status_code=404, detail="Addition not found")
await db.delete(addition)
item.updated_at = datetime.utcnow()
await db.commit()
result = await db.execute(
select(Item).where(Item.id == item.id, Item.user_id == user_id)
)
fresh_item = result.scalar_one()
await refresh_item_search_state(db, fresh_item)
return {"status": "deleted"}
# ── Reprocess item ──
@router.post("/items/{item_id}/reprocess", response_model=ItemOut)
@@ -335,5 +486,7 @@ async def serve_asset(item_id: str, asset_type: str, filename: str):
elif filename.endswith(".jpg") or filename.endswith(".jpeg"): ct = "image/jpeg"
elif filename.endswith(".html"): ct = "text/html"
elif filename.endswith(".pdf"): ct = "application/pdf"
elif filename.endswith(".mp4"): ct = "video/mp4"
elif filename.endswith(".webm"): ct = "video/webm"
return Response(content=data, media_type=ct, headers={"Cache-Control": "public, max-age=3600"})

View File

@@ -17,8 +17,8 @@ MEILI_URL = os.environ.get("MEILI_URL", "http://brain-meili:7700")
MEILI_KEY = os.environ.get("MEILI_MASTER_KEY", "brain-meili-key")
MEILI_INDEX = "items"
# ── Browserless ──
BROWSERLESS_URL = os.environ.get("BROWSERLESS_URL", "http://brain-browserless:3000")
# ── Crawler ──
CRAWLER_URL = os.environ.get("CRAWLER_URL", "http://brain-crawler:3100")
# ── OpenAI ──
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
@@ -42,14 +42,14 @@ DEBUG = os.environ.get("DEBUG", "").lower() in ("1", "true")
# ── Classification rules ──
FOLDERS = [
"Home", "Family", "Work", "Travel", "Knowledge", "Faith", "Projects"
"Home", "Family", "Work", "Travel", "Islam",
"Homelab", "Vanlife", "3D Printing", "Documents",
]
TAGS = [
"reference", "important", "legal", "financial", "insurance",
"research", "idea", "guide", "tutorial", "setup", "how-to",
"tools", "dev", "server", "selfhosted", "home-assistant",
"shopping", "compare", "buy", "product",
"family", "kids", "health", "travel", "faith",
"video", "read-later", "books",
"diy", "reference", "home-assistant", "shopping", "video",
"tutorial", "server", "kids", "books", "travel",
"churning", "lawn-garden", "piracy", "work", "3d-printing",
"lectures", "vanlife", "yusuf", "madiha", "hafsa", "mustafa",
"medical", "legal", "vehicle", "insurance", "financial", "homeschool",
]

View File

@@ -31,7 +31,7 @@ app.include_router(taxonomy_router)
async def startup():
from sqlalchemy import text as sa_text
from app.database import engine, Base
from app.models.item import Item, ItemAsset, AppLink # noqa: import to register models
from app.models.item import Item, ItemAsset, AppLink, ItemAddition # noqa: import to register models
from app.models.taxonomy import Folder, Tag, ItemTag # noqa: register taxonomy tables
# Enable pgvector extension before creating tables

View File

@@ -45,6 +45,12 @@ class Item(Base):
# Relationships
assets = relationship("ItemAsset", back_populates="item", cascade="all, delete-orphan")
additions = relationship(
"ItemAddition",
back_populates="item",
cascade="all, delete-orphan",
order_by="ItemAddition.created_at",
)
__table_args__ = (
Index("ix_items_user_status", "user_id", "processing_status"),
@@ -79,3 +85,19 @@ class AppLink(Base):
app = Column(String(64), nullable=False) # trips|tasks|fitness|inventory
app_entity_id = Column(String(128), nullable=False)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
class ItemAddition(Base):
    """A piece of content appended to an existing item after creation.

    Additions are stored separately from the item so they can be listed and
    deleted individually; the API layer folds them into the parent item's
    searchable text and embedding when they change.
    """
    __tablename__ = "item_additions"
    id = Column(UUID(as_uuid=False), primary_key=True, default=new_id)
    # Parent item; DB-level ON DELETE CASCADE removes additions with it.
    item_id = Column(UUID(as_uuid=False), ForeignKey("items.id", ondelete="CASCADE"), nullable=False, index=True)
    user_id = Column(String(64), nullable=False, index=True)
    source = Column(String(32), nullable=False, default="assistant")  # assistant|manual
    kind = Column(String(32), nullable=False, default="append")  # append
    content = Column(Text, nullable=False)
    # Free-form JSON extras (e.g. assistant context); nullable, defaults to {}.
    metadata_json = Column(JSONB, nullable=True, default=dict)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
    item = relationship("Item", back_populates="additions")

View File

@@ -26,6 +26,13 @@ class ItemUpdate(BaseModel):
raw_content: Optional[str] = None
class ItemAdditionCreate(BaseModel):
    """Request body for POST /items/{item_id}/additions."""
    content: str  # required; API rejects values that are empty after stripping
    source: Optional[str] = "assistant"  # assistant|manual
    kind: Optional[str] = "append"
    metadata_json: Optional[dict] = None  # free-form extras, stored as-is
class SearchQuery(BaseModel):
q: str
folder: Optional[str] = None
@@ -63,6 +70,19 @@ class AssetOut(BaseModel):
model_config = {"from_attributes": True}
class ItemAdditionOut(BaseModel):
    """Response shape for an ItemAddition row (ORM-attribute backed)."""
    id: str
    item_id: str
    source: str
    kind: str
    content: str
    metadata_json: Optional[dict] = None
    created_at: datetime
    updated_at: datetime
    # Allows construction directly from SQLAlchemy model instances.
    model_config = {"from_attributes": True}
class ItemOut(BaseModel):
id: str
type: str

View File

@@ -70,23 +70,24 @@ class ItemTag(Base):
# Default folders with colors and icons
DEFAULT_FOLDERS = [
{"name": "Home", "color": "#059669", "icon": "home"},
{"name": "Family", "color": "#D97706", "icon": "heart"},
{"name": "Work", "color": "#4338CA", "icon": "briefcase"},
{"name": "Travel", "color": "#0EA5E9", "icon": "plane"},
{"name": "Knowledge", "color": "#8B5CF6", "icon": "book-open"},
{"name": "Faith", "color": "#10B981", "icon": "moon"},
{"name": "Projects", "color": "#F43F5E", "icon": "folder"},
{"name": "Home", "color": "#059669", "icon": "home"},
{"name": "Family", "color": "#D97706", "icon": "heart"},
{"name": "Work", "color": "#4338CA", "icon": "briefcase"},
{"name": "Travel", "color": "#0EA5E9", "icon": "plane"},
{"name": "Islam", "color": "#10B981", "icon": "moon"},
{"name": "Homelab", "color": "#6366F1", "icon": "server"},
{"name": "Vanlife", "color": "#F59E0B", "icon": "truck"},
{"name": "3D Printing", "color": "#EC4899", "icon": "printer"},
{"name": "Documents", "color": "#78716C", "icon": "file-text"},
]
# Default tags to seed for new users
DEFAULT_TAGS = [
"reference", "important", "legal", "financial", "insurance",
"research", "idea", "guide", "tutorial", "setup", "how-to",
"tools", "dev", "server", "selfhosted", "home-assistant",
"shopping", "compare", "buy", "product",
"family", "kids", "health", "travel", "faith",
"video", "read-later", "books",
"diy", "reference", "home-assistant", "shopping", "video",
"tutorial", "server", "kids", "books", "travel",
"churning", "lawn-garden", "piracy", "work", "3d-printing",
"lectures", "vanlife", "yusuf", "madiha", "hafsa", "mustafa",
"medical", "legal", "vehicle", "insurance", "financial", "homeschool",
]

View File

@@ -9,20 +9,61 @@ from app.config import OPENAI_API_KEY, OPENAI_MODEL
log = logging.getLogger(__name__)
# Guidance strings for the AI classifier: each entry tells the model when a
# tag applies (and, where needed, when it does NOT). These are interpolated
# into the system prompt; tags without an entry are listed by name only.
TAG_DEFINITIONS = {
    "home-assistant": "Home Assistant specific content (dashboards, ESPHome, automations, integrations, Lovelace cards)",
    "server": "Server/infrastructure content (Docker, backups, networking, self-hosted apps, Linux)",
    "kids": "Anything related to children, parenting, or educational content for kids",
    "shopping": "A product page, product review, or specific item you might want to buy (Amazon, stores, book reviews with purchase links). NOT general discussion threads or forums comparing many options.",
    "diy": "Physical hands-on projects around the house, yard, or vehicle — repairs, woodworking, crafts, building things. NOT software, dashboards, or digital projects.",
    "reference": "Lookup info like contacts, sizes, specs, measurements, settings to remember",
    "video": "Video content (YouTube, TikTok, etc)",
    "tutorial": "How-to guides, step-by-step instructions, learning content",
    "books": "Book recommendations, reviews, or reading lists",
    "travel": "Destinations, resorts, hotels, trip ideas, reviews, places to visit",
    "churning": "Credit card points, miles, award travel, hotel loyalty programs, points maximization, sign-up bonuses",
    "lawn-garden": "Lawn care, gardening, yard work, bug spraying, fertilizer, landscaping, plants, outdoor maintenance",
    "piracy": "Anything to do with downloading content like Audiobooks, games",
    "lectures": "Lecture notes, Islamic talks, sermon recordings, religious class notes",
    "3d-printing": "3D printer files (STL), printer mods, filament, slicer settings, 3D printed objects and projects",
    "work": "Work-related content",
    "vanlife": "Van conversion, Promaster van, van build projects, camping in vans, van electrical/solar, van life lifestyle",
    "yusuf": "Personal document belonging to family member Yusuf (look for name in title or content)",
    "madiha": "Personal document belonging to family member Madiha (look for name in title or content)",
    "hafsa": "Personal document belonging to family member Hafsa (look for name in title or content)",
    "mustafa": "Personal document belonging to family member Mustafa (look for name in title or content)",
    "medical": "Medical records, allergy results, prescriptions, lab work, vaccination records, doctor notes",
    "legal": "Birth certificates, passports, IDs, citizenship papers, contracts, legal agreements",
    "vehicle": "Car registration, license plates, insurance cards, vehicle titles, maintenance records",
    "insurance": "Insurance policies, insurance cards, coverage documents, claims",
    "financial": "Tax documents, bank statements, pay stubs, loan papers, credit reports",
    "homeschool": "Homeschooling resources, curriculum, lesson plans, educational materials for teaching kids at home, school projects, science experiments",
}
def build_system_prompt(folders: list[str], tags: list[str]) -> str:
tag_defs = "\n".join(
f" - '{t}': {TAG_DEFINITIONS[t]}" if t in TAG_DEFINITIONS else f" - '{t}'"
for t in tags
)
return f"""You are a classification engine for a personal "second brain" knowledge management system.
Given an item (URL, note, document, or file), you must return structured JSON with:
- folder: exactly 1 from this list: {json.dumps(folders)}
- tags: exactly 2 or 3 from this list: {json.dumps(tags)}
- title: a concise, normalized title (max 80 chars)
- tags: ONLY from this predefined list. Do NOT create any new tags outside this list. If no tags fit, return an empty array.
- title: a concise, normalized title in Title Case with spaces (max 80 chars, e.g. 'Machine Learning', 'Web Development')
- summary: a 1-2 sentence summary of the content (for links/documents only)
- corrected_text: for NOTES ONLY — return the original note text with spelling/grammar fixed. Keep the original meaning, tone, and structure. Only fix typos and obvious errors. Return empty string for non-notes.
- confidence: a float 0.0-1.0 indicating how confident you are
Tag definitions (only assign tags that STRONGLY match the content):
{tag_defs}
Rules:
- NEVER invent folders or tags not in the lists above
- Only assign tags that STRONGLY match the content. 1-2 tags is perfectly fine.
- Do NOT pad with extra tags just to reach a target number. If only one tag fits, only use one.
- If NO tags fit the content, return an empty tags array.
- Name tags: 'yusuf', 'madiha', 'hafsa', or 'mustafa' ONLY when the content is a personal document belonging to that family member (look for their name in the title or content)
- NEVER skip classification
- NEVER return freeform text outside the schema
- For notes: do NOT summarize. Keep the original text. Only fix spelling.
@@ -43,7 +84,7 @@ def build_response_schema(folders: list[str], tags: list[str]) -> dict:
"tags": {
"type": "array",
"items": {"type": "string", "enum": tags},
"minItems": 2,
"minItems": 0,
"maxItems": 3,
},
"title": {"type": "string"},
@@ -88,8 +129,8 @@ async def classify_item(
if not OPENAI_API_KEY:
log.warning("No OPENAI_API_KEY set, returning defaults")
return {
"folder": "Knowledge",
"tags": ["reference", "read-later"],
"folder": "Home",
"tags": ["reference"],
"title": title or "Untitled",
"summary": "No AI classification available",
"confidence": 0.0,
@@ -122,10 +163,8 @@ async def classify_item(
# Validate folder and tags are in allowed sets
if result["folder"] not in folders:
result["folder"] = folders[0] if folders else "Knowledge"
result["folder"] = folders[0] if folders else "Home"
result["tags"] = [t for t in result["tags"] if t in tags][:3]
if len(result["tags"]) < 2:
result["tags"] = (result["tags"] + ["reference", "read-later"])[:3]
return result
@@ -133,8 +172,8 @@ async def classify_item(
log.error(f"Classification attempt {attempt + 1} failed: {e}")
if attempt == retries:
return {
"folder": "Knowledge",
"tags": ["reference", "read-later"],
"folder": "Home",
"tags": ["reference"],
"title": title or "Untitled",
"summary": f"Classification failed: {e}",
"confidence": 0.0,

View File

@@ -1,162 +1,218 @@
"""Content ingestion — fetch, extract, screenshot, archive."""
"""Content ingestion — Playwright crawler for HTML, screenshots, og:image."""
import base64
import logging
import re
import uuid
from html.parser import HTMLParser
from io import StringIO
from urllib.parse import urlparse
import httpx
from app.config import BROWSERLESS_URL
from app.config import CRAWLER_URL
from app.services.storage import storage
log = logging.getLogger(__name__)
class _HTMLTextExtractor(HTMLParser):
"""Simple HTML to text converter."""
def __init__(self):
super().__init__()
self._result = StringIO()
self._skip = False
self._skip_tags = {"script", "style", "noscript", "svg"}
# ── YouTube helpers ──
def handle_starttag(self, tag, attrs):
if tag in self._skip_tags:
self._skip = True
def handle_endtag(self, tag):
if tag in self._skip_tags:
self._skip = False
if tag in ("p", "div", "br", "h1", "h2", "h3", "h4", "li", "tr"):
self._result.write("\n")
def handle_data(self, data):
if not self._skip:
self._result.write(data)
def get_text(self) -> str:
raw = self._result.getvalue()
# Collapse whitespace
lines = [line.strip() for line in raw.splitlines()]
return "\n".join(line for line in lines if line)
def _extract_youtube_id(url: str) -> str | None:
patterns = [
r'(?:youtube\.com/watch\?.*v=|youtu\.be/|youtube\.com/shorts/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})',
]
for pat in patterns:
m = re.search(pat, url)
if m:
return m.group(1)
return None
def html_to_text(html: str) -> str:
extractor = _HTMLTextExtractor()
extractor.feed(html)
return extractor.get_text()
def _is_youtube_url(url: str) -> bool:
    """True when a YouTube video id can be extracted from *url*."""
    return _extract_youtube_id(url) is not None
def extract_title_from_html(html: str) -> str | None:
match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
return match.group(1).strip() if match else None
async def fetch_youtube_metadata(url: str) -> dict | None:
"""Fetch YouTube video metadata via oEmbed. No API key needed."""
video_id = _extract_youtube_id(url)
if not video_id:
return None
result = {
"title": None,
"description": None,
"author": None,
"thumbnail_url": f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
"video_id": video_id,
"is_short": "/shorts/" in url,
}
def extract_meta_description(html: str) -> str | None:
match = re.search(
r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
html, re.IGNORECASE | re.DOTALL,
)
return match.group(1).strip() if match else None
async def fetch_url_content(url: str) -> dict:
"""Fetch URL content. Returns dict with html, text, title, description, used_browserless."""
result = {"html": None, "text": None, "title": None, "description": None, "used_browserless": False}
# Try HTTP-first extraction
try:
async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
resp = await client.get(url, headers={
"User-Agent": "Mozilla/5.0 (compatible; SecondBrain/1.0)"
})
resp.raise_for_status()
html = resp.text
result["html"] = html
result["text"] = html_to_text(html)
result["title"] = extract_title_from_html(html)
result["description"] = extract_meta_description(html)
# If extraction is weak (< 200 chars of text), try browserless
if len(result["text"] or "") < 200:
log.info(f"Weak extraction ({len(result['text'] or '')} chars), trying browserless")
br = await fetch_with_browserless(url)
if br and len(br.get("text", "")) > len(result["text"] or ""):
result.update(br)
result["used_browserless"] = True
async with httpx.AsyncClient(timeout=10) as client:
oembed_url = f"https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json"
resp = await client.get(oembed_url)
if resp.status_code == 200:
data = resp.json()
result["title"] = data.get("title")
result["author"] = data.get("author_name")
noembed_url = f"https://noembed.com/embed?url=https://www.youtube.com/watch?v={video_id}"
resp2 = await client.get(noembed_url)
if resp2.status_code == 200:
data2 = resp2.json()
if not result["title"]:
result["title"] = data2.get("title")
if not result["author"]:
result["author"] = data2.get("author_name")
except Exception as e:
log.warning(f"HTTP fetch failed for {url}: {e}, trying browserless")
try:
br = await fetch_with_browserless(url)
if br:
result.update(br)
result["used_browserless"] = True
except Exception as e2:
log.error(f"Browserless also failed for {url}: {e2}")
log.warning(f"YouTube metadata fetch failed: {e}")
return result
async def fetch_with_browserless(url: str) -> dict | None:
"""Use browserless/chrome to render JS-heavy pages."""
async def download_youtube_thumbnail(url: str, item_id: str) -> str | None:
"""Download YouTube thumbnail and save as screenshot asset."""
video_id = _extract_youtube_id(url)
if not video_id:
return None
urls_to_try = [
f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg",
]
try:
async with httpx.AsyncClient(timeout=30) as client:
resp = await client.post(
f"{BROWSERLESS_URL}/content",
json={"url": url, "waitForTimeout": 3000},
)
if resp.status_code == 200:
html = resp.text
return {
"html": html,
"text": html_to_text(html),
"title": extract_title_from_html(html),
"description": extract_meta_description(html),
}
async with httpx.AsyncClient(timeout=10) as client:
for thumb_url in urls_to_try:
resp = await client.get(thumb_url)
if resp.status_code == 200 and len(resp.content) > 1000:
path = storage.save(
item_id=item_id, asset_type="screenshot",
filename="thumbnail.jpg", data=resp.content,
)
return path
except Exception as e:
log.error(f"Browserless fetch failed: {e}")
log.warning(f"YouTube thumbnail download failed: {e}")
return None
async def take_screenshot(url: str, item_id: str) -> str | None:
"""Take a screenshot of a URL using browserless. Returns storage path or None."""
try:
async with httpx.AsyncClient(timeout=30) as client:
resp = await client.post(
f"{BROWSERLESS_URL}/screenshot",
json={
"url": url,
"options": {"type": "png", "fullPage": False},
"waitForTimeout": 3000,
},
async def download_youtube_video(url: str, item_id: str) -> tuple[str | None, dict]:
    """Download a YouTube video (<=720p mp4) via yt-dlp and store it as an asset.

    Runs yt-dlp in a worker thread against a temporary directory, then copies
    the resulting mp4 into asset storage under the given item.

    Args:
        url: YouTube watch / shorts / short-link URL.
        item_id: Owning item id, used as the storage key.

    Returns:
        (storage_path, info): storage_path is None on any failure; info is the
        parsed yt-dlp ``.info.json`` metadata, or an empty dict if absent.
    """
    import asyncio
    import json as _json
    import os
    import subprocess
    import tempfile
    video_id = _extract_youtube_id(url)
    if not video_id:
        return None, {}
    with tempfile.TemporaryDirectory() as tmpdir:
        outpath = os.path.join(tmpdir, "%(id)s.%(ext)s")
        cmd = [
            "yt-dlp", "--no-playlist",
            "-f", "bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/best[height<=720][ext=mp4]/best[height<=720]",
            "--merge-output-format", "mp4",
            "--write-info-json", "--no-write-playlist-metafiles",
            "-o", outpath, url,
        ]
        try:
            # to_thread keeps the event loop free while yt-dlp runs (list
            # argv, shell=False — no shell-injection surface).
            proc = await asyncio.to_thread(
                subprocess.run, cmd, capture_output=True, text=True, timeout=120,
            )
            if proc.returncode != 0:
                log.warning(f"yt-dlp failed: {proc.stderr[:300]}")
                return None, {}
            video_file = None
            info = {}
            for f in os.listdir(tmpdir):
                if f.endswith(".mp4"):
                    video_file = os.path.join(tmpdir, f)
                elif f.endswith(".info.json"):
                    with open(os.path.join(tmpdir, f)) as fh:
                        info = _json.load(fh)
            if not video_file:
                return None, {}
            # Context manager so the handle is closed promptly — the previous
            # revision leaked it via open(...).read().
            with open(video_file, "rb") as vf:
                file_data = vf.read()
            path = storage.save(
                item_id=item_id, asset_type="video",
                filename=f"{video_id}.mp4", data=file_data,
            )
            log.info(f"Downloaded YouTube video: {len(file_data)} bytes -> {path}")
            return path, info
        except subprocess.TimeoutExpired:
            log.warning(f"yt-dlp timed out for {url}")
            return None, {}
        except Exception as e:
            log.error(f"YouTube download failed: {e}")
            return None, {}
# ── Main crawler (Playwright stealth service) ──
async def crawl_url(url: str) -> dict:
    """Call the Playwright crawler service. Returns dict with html, text, title,
    description, author, og_image_url, screenshot (base64), status_code, error.

    On failure, returns an empty payload mirroring the crawler's response
    shape with ``error`` describing what went wrong.
    """
    # Fixed: the previous `'e' in dir()` check never saw the exception —
    # Python unbinds the except-target after the handler, so the error field
    # was always "unknown". Capture the detail in a local instead.
    error_detail = "unknown"
    try:
        async with httpx.AsyncClient(timeout=45) as client:
            resp = await client.post(f"{CRAWLER_URL}/crawl", json={"url": url})
            if resp.status_code == 200:
                return resp.json()
            log.warning(f"Crawler returned {resp.status_code} for {url}")
            error_detail = f"crawler returned status {resp.status_code}"
    except Exception as e:
        log.error(f"Crawler request failed for {url}: {e}")
        error_detail = str(e)
    return {"url": url, "html": None, "text": None, "title": None,
            "description": None, "og_image_url": None, "screenshot": None,
            "error": error_detail}
async def save_screenshot_from_base64(b64: str, item_id: str) -> str | None:
    """Decode base64 screenshot and save to storage."""
    try:
        decoded = base64.b64decode(b64)
        # Heuristic: payloads under 500 bytes are treated as failed captures.
        if len(decoded) < 500:
            return None
        return storage.save(
            item_id=item_id, asset_type="screenshot",
            filename="screenshot.jpg", data=decoded,
        )
    except Exception as e:
        log.error(f"Screenshot save failed: {e}")
        return None
async def download_og_image(og_url: str, item_id: str) -> str | None:
    """Download an og:image and save as asset.

    Returns the storage path, or None when the fetch fails, returns non-200,
    or the payload is too small (<= 1000 bytes) to be a real image.
    """
    # og:image URLs scraped out of HTML often carry encoded ampersands.
    og_url = og_url.replace("&amp;", "&")
    try:
        async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
            resp = await client.get(og_url, headers={
                "User-Agent": "Mozilla/5.0 (compatible; SecondBrain/1.0)"
            })
            if resp.status_code == 200 and len(resp.content) > 1000:
                ct = resp.headers.get("content-type", "image/jpeg")
                ext = "png" if "png" in ct else "jpg"
                path = storage.save(
                    item_id=item_id, asset_type="og_image",
                    filename=f"og_image.{ext}", data=resp.content,
                )
                log.info(f"Downloaded og:image ({len(resp.content)} bytes) for {item_id}")
                return path
    except Exception as e:
        # Fixed: removed a stale log line that referenced the undefined name
        # `url` here, which would itself raise NameError inside the handler.
        log.warning(f"og:image download failed: {e}")
    return None
async def archive_html(html: str, item_id: str) -> str | None:
"""Save the full HTML as an archived asset."""
"""Save full HTML as an archived asset."""
if not html:
return None
try:
path = storage.save(
item_id=item_id,
asset_type="archived_html",
filename="page.html",
data=html.encode("utf-8"),
item_id=item_id, asset_type="archived_html",
filename="page.html", data=html.encode("utf-8"),
)
return path
except Exception as e:

View File

@@ -12,6 +12,7 @@ from sqlalchemy.orm import selectinload
from app.config import REDIS_URL, DATABASE_URL_SYNC
from app.models.item import Item, ItemAsset
from app.models.taxonomy import Folder, Tag, ItemTag # noqa: F401 — register FK targets
log = logging.getLogger(__name__)
@@ -34,7 +35,7 @@ async def _process_item(item_id: str):
"""Full processing pipeline for a saved item."""
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from app.config import DATABASE_URL
from app.services.ingest import fetch_url_content, take_screenshot, archive_html
from app.services.ingest import crawl_url, save_screenshot_from_base64, download_og_image, archive_html
from app.services.classify import classify_item
from app.services.embed import generate_embedding
from app.search.engine import index_item, ensure_meili_index
@@ -62,42 +63,96 @@ async def _process_item(item_id: str):
# ── Step 1: Fetch content for URLs ──
if item.type == "link" and item.url:
log.info(f"Fetching URL: {item.url}")
content = await fetch_url_content(item.url)
html_content = content.get("html")
extracted_text = content.get("text") or extracted_text
if not title:
title = content.get("title")
from app.services.ingest import (
_is_youtube_url, download_youtube_thumbnail,
download_youtube_video, fetch_youtube_metadata,
)
item.metadata_json = item.metadata_json or {}
item.metadata_json["description"] = content.get("description")
item.metadata_json["used_browserless"] = content.get("used_browserless", False)
is_yt = _is_youtube_url(item.url)
# Take screenshot
screenshot_path = await take_screenshot(item.url, item.id)
if screenshot_path:
asset = ItemAsset(
id=str(uuid.uuid4()),
item_id=item.id,
asset_type="screenshot",
filename="screenshot.png",
content_type="image/png",
storage_path=screenshot_path,
)
db.add(asset)
if is_yt:
# YouTube: use oEmbed + thumbnail + yt-dlp (no crawler needed)
log.info(f"Processing YouTube URL: {item.url}")
yt_meta = await fetch_youtube_metadata(item.url)
if yt_meta:
if not title:
title = yt_meta.get("title")
extracted_text = f"YouTube: {yt_meta.get('title','')}\nBy: {yt_meta.get('author','')}"
item.metadata_json["youtube"] = {
"video_id": yt_meta.get("video_id"),
"author": yt_meta.get("author"),
"is_short": yt_meta.get("is_short", False),
}
item.metadata_json["description"] = f"YouTube video by {yt_meta.get('author','')}"
# Archive HTML
if html_content:
html_path = await archive_html(html_content, item.id)
if html_path:
asset = ItemAsset(
id=str(uuid.uuid4()),
item_id=item.id,
asset_type="archived_html",
filename="page.html",
content_type="text/html",
storage_path=html_path,
)
db.add(asset)
# Download video
log.info(f"Downloading YouTube video: {item.url}")
video_path, yt_info = await download_youtube_video(item.url, item.id)
if video_path:
db.add(ItemAsset(
id=str(uuid.uuid4()), item_id=item.id,
asset_type="video", filename=f"{yt_meta['video_id']}.mp4",
content_type="video/mp4", storage_path=video_path,
))
if yt_info.get("duration"):
item.metadata_json["youtube"]["duration"] = yt_info["duration"]
if yt_info.get("description"):
item.metadata_json["youtube"]["description"] = yt_info["description"][:500]
extracted_text = f"YouTube: {title or ''}\nBy: {(yt_meta or {}).get('author','')}\n{yt_info['description'][:2000]}"
# Thumbnail
thumb_path = await download_youtube_thumbnail(item.url, item.id)
if thumb_path:
db.add(ItemAsset(
id=str(uuid.uuid4()), item_id=item.id,
asset_type="screenshot", filename="thumbnail.jpg",
content_type="image/jpeg", storage_path=thumb_path,
))
else:
# Regular URL: use Playwright crawler (stealth)
log.info(f"Crawling URL: {item.url}")
crawl = await crawl_url(item.url)
html_content = crawl.get("html")
extracted_text = crawl.get("text") or extracted_text
if not title:
title = crawl.get("title")
item.metadata_json["description"] = crawl.get("description")
item.metadata_json["author"] = crawl.get("author")
item.metadata_json["status_code"] = crawl.get("status_code")
# Screenshot (from crawler, base64 JPEG)
if crawl.get("screenshot"):
ss_path = await save_screenshot_from_base64(crawl["screenshot"], item.id)
if ss_path:
db.add(ItemAsset(
id=str(uuid.uuid4()), item_id=item.id,
asset_type="screenshot", filename="screenshot.jpg",
content_type="image/jpeg", storage_path=ss_path,
))
# og:image (extracted from rendered DOM by crawler)
og_url = crawl.get("og_image_url")
if og_url:
og_path = await download_og_image(og_url, item.id)
if og_path:
db.add(ItemAsset(
id=str(uuid.uuid4()), item_id=item.id,
asset_type="og_image", filename="og_image.jpg",
content_type="image/jpeg", storage_path=og_path,
))
item.metadata_json["og_image_url"] = og_url
# Archive HTML
if html_content:
html_path = await archive_html(html_content, item.id)
if html_path:
db.add(ItemAsset(
id=str(uuid.uuid4()), item_id=item.id,
asset_type="archived_html", filename="page.html",
content_type="text/html", storage_path=html_path,
))
# ── Step 1b: Process uploaded files (PDF, image, document) ──
if item.type in ("pdf", "image", "document", "file"):

View File

@@ -0,0 +1,20 @@
FROM node:20-slim

# Install Playwright's Chromium *system* dependencies.
# Pinned to 1.50.0 to match package.json's playwright-core (^1.50.0).
RUN npx playwright@1.50.0 install-deps chromium

WORKDIR /app

# Install node deps first so this layer is cached across server.js changes.
COPY package.json ./
RUN npm install

# Install the Chromium browser binary — pinned to the SAME Playwright
# version as install-deps above (previously unpinned, so a new Playwright
# release could fetch a browser build mismatched with the installed deps).
RUN npx playwright@1.50.0 install chromium

COPY server.js ./

ENV NODE_ENV=production
EXPOSE 3100

# Container healthcheck hits the crawler's /health endpoint.
HEALTHCHECK --interval=15s --timeout=5s --retries=3 CMD wget -qO- http://localhost:3100/health || exit 1

CMD ["node", "server.js"]

View File

@@ -0,0 +1,24 @@
{
"name": "brain-crawler",
"version": "1.0.0",
"private": true,
"type": "module",
"scripts": {
"start": "node server.js"
},
"dependencies": {
"playwright-extra": "^4.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"playwright-core": "^1.50.0",
"metascraper": "^5.45.25",
"metascraper-image": "^5.45.25",
"metascraper-title": "^5.45.25",
"metascraper-description": "^5.45.25",
"metascraper-author": "^5.45.25",
"metascraper-date": "^5.45.25",
"metascraper-publisher": "^5.45.25",
"metascraper-url": "^5.45.25",
"@mozilla/readability": "^0.5.0",
"jsdom": "^25.0.0"
}
}

View File

@@ -0,0 +1,370 @@
import http from "node:http";
import { chromium } from "playwright-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";
chromium.use(StealthPlugin());
// HTTP port for the crawler service (override with the PORT env var).
const PORT = parseInt(process.env.PORT || "3100");
// Desktop-sized viewport used for rendering and screenshots.
const VIEWPORT = { width: 1440, height: 900 };
// Desktop Chrome user agent applied to every browser context.
const USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
// page.goto() navigation timeout, in milliseconds.
const NAV_TIMEOUT = 30_000;
// NOTE(review): declared but not referenced anywhere in this file.
const SCREENSHOT_TIMEOUT = 8_000;
// Shared Chromium instance, launched lazily by ensureBrowser().
let browser = null;
/**
 * Return a live Chromium instance, launching one on demand.
 * A stale disconnected handle on the module-level `browser` singleton is
 * disposed before relaunching, so callers always get a connected browser.
 */
async function ensureBrowser() {
  if (browser?.isConnected()) return browser;
  if (browser) {
    // Old handle lost its connection — dispose it best-effort.
    try {
      await browser.close();
    } catch {}
    browser = null;
  }
  console.log("[crawler] Launching browser...");
  const launchArgs = [
    "--no-sandbox",
    "--disable-setuid-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
  ];
  browser = await chromium.launch({ headless: true, args: launchArgs });
  console.log("[crawler] Browser ready");
  return browser;
}
// Extract og:image and other meta from rendered HTML
function extractMeta(html) {
const meta = {};
const patterns = {
og_image: [
/(?:property|name)=["']og:image["'][^>]*content=["']([^"']+)["']/i,
/content=["']([^"']+)["'][^>]*(?:property|name)=["']og:image["']/i,
],
title: [
/(?:property|name)=["']og:title["'][^>]*content=["']([^"']+)["']/i,
/content=["']([^"']+)["'][^>]*(?:property|name)=["']og:title["']/i,
/<title[^>]*>([^<]+)<\/title>/i,
],
description: [
/(?:property|name)=["']og:description["'][^>]*content=["']([^"']+)["']/i,
/name=["']description["'][^>]*content=["']([^"']+)["']/i,
/content=["']([^"']+)["'][^>]*(?:property|name)=["']og:description["']/i,
],
author: [
/name=["']author["'][^>]*content=["']([^"']+)["']/i,
/property=["']article:author["'][^>]*content=["']([^"']+)["']/i,
],
favicon: [
/rel=["']icon["'][^>]*href=["']([^"']+)["']/i,
/rel=["']shortcut icon["'][^>]*href=["']([^"']+)["']/i,
],
};
for (const [key, pats] of Object.entries(patterns)) {
for (const pat of pats) {
const m = html.match(pat);
if (m) {
meta[key] = m[1].trim();
break;
}
}
}
return meta;
}
function isRedditUrl(url) {
try {
const h = new URL(url).hostname;
return h === "www.reddit.com" || h === "reddit.com";
} catch {}
return false;
}
/**
 * Resolve a Reddit "share" short link (/r/<sub>/s/<id>) to the canonical
 * /comments/ post URL by issuing a HEAD request and following redirects.
 * Non-short URLs — and any resolution failure — return the input unchanged.
 */
async function resolveRedditShortUrl(url) {
  // Reddit short URLs (/r/sub/s/xxx) redirect to the actual post
  if (/\/s\/[a-zA-Z0-9]+/.test(url)) {
    try {
      const resp = await fetch(url, {
        method: "HEAD",
        redirect: "follow",
        headers: { "User-Agent": "SecondBrain/1.0" },
      });
      // resp.url is the final URL after redirects; only accept it if it
      // landed on a real post page.
      const resolved = resp.url;
      if (resolved && resolved.includes("/comments/")) {
        console.log(`[crawler] Reddit short URL resolved: ${url} -> ${resolved}`);
        return resolved;
      }
    } catch (e) {
      // Best-effort: fall back to the original URL on network errors.
      console.warn("[crawler] Reddit short URL resolve failed:", e.message);
    }
  }
  return url;
}
async function fetchRedditJson(url) {
// Resolve short URLs first
url = await resolveRedditShortUrl(url);
// Reddit JSON API — append .json to get structured data
try {
const jsonUrl = url.replace(/\/?(\?.*)?$/, "/.json$1");
const resp = await fetch(jsonUrl, {
headers: { "User-Agent": "SecondBrain/1.0" },
redirect: "follow",
});
if (!resp.ok) return null;
const data = await resp.json();
const post = data?.[0]?.data?.children?.[0]?.data;
if (!post) return null;
const previewImg = (post.preview?.images?.[0]?.source?.url || "").replace(/&amp;/g, "&") || null;
const thumbnail = post.thumbnail?.startsWith("http") ? post.thumbnail : null;
// If no preview image, try to get subreddit icon
let ogImage = previewImg || thumbnail || null;
if (!ogImage && post.subreddit) {
try {
const aboutResp = await fetch(
`https://www.reddit.com/r/${post.subreddit}/about.json`,
{ headers: { "User-Agent": "SecondBrain/1.0" } }
);
if (aboutResp.ok) {
const about = await aboutResp.json();
const icon = about?.data?.community_icon?.replace(/&amp;/g, "&")?.split("?")?.[0]
|| about?.data?.icon_img
|| about?.data?.header_img;
if (icon && icon.startsWith("http")) {
ogImage = icon;
}
}
} catch {}
}
return {
url,
html: null,
text: `${post.title || ""}\n\n${post.selftext || ""}`.trim(),
title: post.title || null,
description: (post.selftext || "").slice(0, 200) || null,
author: post.author ? `u/${post.author}` : null,
og_image_url: ogImage ? ogImage.replace(/&amp;/g, "&") : null,
favicon: null,
screenshot: null,
status_code: 200,
error: null,
subreddit: post.subreddit_name_prefixed || null,
};
} catch (e) {
console.warn("[crawler] Reddit JSON failed:", e.message);
return null;
}
}
/**
 * Crawl one URL and return structured page data: rendered HTML, readable
 * article HTML (Mozilla Readability), plain text, meta fields, and a
 * base64 JPEG screenshot. Reddit URLs are served from the JSON API when
 * possible. Errors are captured in result.error rather than thrown; a
 * crashed browser resets the singleton for the next request.
 */
async function crawl(url) {
  // Reddit: use JSON API (avoids login walls entirely)
  if (isRedditUrl(url)) {
    const redditData = await fetchRedditJson(url);
    if (redditData) {
      console.log(`[crawler] Reddit JSON OK: ${url} (og=${!!redditData.og_image_url})`);
      return redditData;
    }
    console.log(`[crawler] Reddit JSON failed, falling back to browser: ${url}`);
  }
  const crawlUrl = url;
  let b;
  try {
    b = await ensureBrowser();
  } catch (e) {
    // One retry after resetting the singleton — covers a dead browser handle.
    console.error("[crawler] Browser launch failed, retrying:", e.message);
    browser = null;
    b = await ensureBrowser();
  }
  const contextOpts = {
    viewport: VIEWPORT,
    userAgent: USER_AGENT,
    ignoreHTTPSErrors: true,
  };
  // Reddit browser fallback: send browser-like Accept headers so the
  // request looks less bot-like (note: headers, not cookies).
  if (isRedditUrl(url)) {
    contextOpts.extraHTTPHeaders = {
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.5",
    };
  }
  const context = await b.newContext(contextOpts);
  const page = await context.newPage();
  // Pre-filled result envelope; fields stay null when extraction fails.
  const result = {
    url,
    html: null,
    text: null,
    readable_html: null,
    title: null,
    description: null,
    author: null,
    og_image_url: null,
    favicon: null,
    screenshot: null, // base64
    status_code: null,
    error: null,
  };
  try {
    // Navigate (use normalized URL to avoid login walls)
    const response = await page.goto(crawlUrl, {
      waitUntil: "domcontentloaded",
      timeout: NAV_TIMEOUT,
    });
    result.status_code = response?.status() || null;
    // Wait for network to settle (up to 5s)
    try {
      await page.waitForLoadState("networkidle", { timeout: 5000 });
    } catch {
      // networkidle timeout is fine, page is probably loaded enough
    }
    // Reddit: dismiss login modals and overlays
    if (isRedditUrl(url)) {
      await page.evaluate(() => {
        // Remove login modal/overlay
        document.querySelectorAll('shreddit-overlay-display, [id*="login"], .overlay-container, reddit-cookie-banner').forEach(el => el.remove());
        // Remove any body scroll locks
        document.body.style.overflow = 'auto';
        document.documentElement.style.overflow = 'auto';
      }).catch(() => {});
      await page.waitForTimeout(1000);
    }
    // Get rendered HTML + screenshot in parallel
    const [html, screenshot] = await Promise.all([
      page.content(),
      page
        .screenshot({ type: "jpeg", quality: 80, fullPage: false })
        .catch((e) => {
          // Screenshot failure is non-fatal; the crawl still returns data.
          console.warn("[crawler] Screenshot failed:", e.message);
          return null;
        }),
    ]);
    result.html = html;
    // Extract text from page — prefer semantic containers over <body>.
    result.text = await page
      .evaluate(() => {
        const el =
          document.querySelector("article") ||
          document.querySelector("main") ||
          document.querySelector('[role="main"]') ||
          document.body;
        return el ? el.innerText.slice(0, 10000) : "";
      })
      .catch(() => "");
    // Extract readable article HTML via Mozilla Readability
    try {
      const dom = new JSDOM(html, { url: crawlUrl });
      const reader = new Readability(dom.window.document);
      const article = reader.parse();
      if (article && article.content) {
        result.readable_html = article.content;
        // Readability's text beats the raw innerText grab when available.
        if (article.textContent) {
          result.text = article.textContent.slice(0, 10000);
        }
      }
    } catch (e) {
      console.warn("[crawler] Readability failed:", e.message);
    }
    // Extract meta from rendered DOM
    const meta = extractMeta(html);
    result.title = meta.title || (await page.title()) || null;
    result.description = meta.description || null;
    result.author = meta.author || null;
    result.og_image_url = meta.og_image || null;
    result.favicon = meta.favicon || null;
    // Screenshot as base64
    if (screenshot) {
      result.screenshot = screenshot.toString("base64");
    }
  } catch (e) {
    result.error = e.message;
    console.error("[crawler] Crawl error:", url, e.message);
    // If browser crashed, reset it for next request
    if (e.message.includes("closed") || e.message.includes("crashed")) {
      browser = null;
    }
  } finally {
    // Always release the per-request page and context.
    await page.close().catch(() => {});
    await context.close().catch(() => {});
  }
  return result;
}
// Simple HTTP server
const server = http.createServer(async (req, res) => {
// Health check
if (req.method === "GET" && req.url === "/health") {
res.writeHead(200, { "Content-Type": "application/json" });
res.end(JSON.stringify({ status: "ok" }));
return;
}
// Crawl endpoint
if (req.method === "POST" && req.url === "/crawl") {
let body = "";
req.on("data", (chunk) => (body += chunk));
req.on("end", async () => {
try {
const { url } = JSON.parse(body);
if (!url) {
res.writeHead(400, { "Content-Type": "application/json" });
res.end(JSON.stringify({ error: "url is required" }));
return;
}
console.log(`[crawler] Crawling: ${url}`);
const result = await crawl(url);
console.log(
`[crawler] Done: ${url} (status=${result.status_code}, og=${!!result.og_image_url}, ss=${!!result.screenshot})`
);
res.writeHead(200, { "Content-Type": "application/json" });
res.end(JSON.stringify(result));
} catch (e) {
console.error("[crawler] Request error:", e);
res.writeHead(500, { "Content-Type": "application/json" });
res.end(JSON.stringify({ error: e.message }));
}
});
return;
}
res.writeHead(404);
res.end("Not found");
});
// Startup
(async () => {
await ensureBrowser();
server.listen(PORT, () => {
console.log(`[crawler] Listening on :${PORT}`);
});
})();
// Graceful shutdown
process.on("SIGTERM", async () => {
console.log("[crawler] Shutting down...");
if (browser) await browser.close().catch(() => {});
process.exit(0);
});

View File

@@ -13,7 +13,7 @@ services:
- REDIS_URL=redis://brain-redis:6379/0
- MEILI_URL=http://brain-meili:7700
- MEILI_MASTER_KEY=${MEILI_MASTER_KEY:-brain-meili-secure-key-2026}
- BROWSERLESS_URL=http://brain-browserless:3000
- CRAWLER_URL=http://brain-crawler:3100
- OPENAI_API_KEY=${OPENAI_API_KEY}
- OPENAI_MODEL=${OPENAI_MODEL:-gpt-4o-mini}
- PORT=8200
@@ -44,7 +44,7 @@ services:
- REDIS_URL=redis://brain-redis:6379/0
- MEILI_URL=http://brain-meili:7700
- MEILI_MASTER_KEY=${MEILI_MASTER_KEY:-brain-meili-secure-key-2026}
- BROWSERLESS_URL=http://brain-browserless:3000
- CRAWLER_URL=http://brain-crawler:3100
- OPENAI_API_KEY=${OPENAI_API_KEY}
- OPENAI_MODEL=${OPENAI_MODEL:-gpt-4o-mini}
- TZ=${TZ:-America/Chicago}
@@ -90,14 +90,17 @@ services:
volumes:
- ./data/meili:/meili_data
# ── Browserless (headless Chrome for JS rendering + screenshots) ──
brain-browserless:
image: ghcr.io/browserless/chromium:latest
container_name: brain-browserless
# ── Crawler (Playwright + stealth for JS rendering + screenshots) ──
brain-crawler:
build:
context: ./crawler
dockerfile: Dockerfile
container_name: brain-crawler
restart: unless-stopped
environment:
- MAX_CONCURRENT_SESSIONS=3
- TIMEOUT=30000
- PORT=3100
- TZ=${TZ:-America/Chicago}
shm_size: '1gb'
networks:
pangolin:

View File

@@ -0,0 +1,254 @@
"""Migrate all bookmarks from Karakeep into Brain service via API."""
import json
import os
import sys
import time
import urllib.request
import urllib.error
import tempfile
KARAKEEP_URL = os.environ.get("KARAKEEP_URL", "http://192.168.1.42:3005")
KARAKEEP_API_KEY = os.environ.get("KARAKEEP_API_KEY", "ak2_f4141e5fe7265e23bd6f_4549c932c262010eafd08acb2139f1ac")
BRAIN_URL = "http://localhost:8200"
BRAIN_USER = "admin"
def karakeep_get(path):
    """GET ``path`` from the Karakeep API and return the decoded JSON.

    Raises urllib.error.HTTPError / URLError on network failure.
    """
    req = urllib.request.Request(
        f"{KARAKEEP_URL}{path}",
        headers={"Authorization": f"Bearer {KARAKEEP_API_KEY}"},
    )
    # Context manager closes the HTTP response deterministically
    # (the original left the socket to be reclaimed by GC).
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())
def karakeep_download(asset_id):
    """Download a Karakeep asset's raw bytes.

    Returns (data, content_type); content_type falls back to
    application/octet-stream when the server omits the header.
    """
    req = urllib.request.Request(
        f"{KARAKEEP_URL}/api/v1/assets/{asset_id}",
        headers={"Authorization": f"Bearer {KARAKEEP_API_KEY}"},
    )
    # Close the response even if read() raises (previously leaked).
    with urllib.request.urlopen(req, timeout=120) as resp:
        return resp.read(), resp.headers.get("Content-Type", "application/octet-stream")
def brain_post_json(path, data):
    """POST ``data`` as JSON to the Brain API under /api; return the parsed reply."""
    body = json.dumps(data).encode()
    req = urllib.request.Request(
        f"{BRAIN_URL}/api{path}",
        data=body,
        headers={"X-Gateway-User-Id": BRAIN_USER, "Content-Type": "application/json"},
        method="POST",
    )
    # Close the response deterministically instead of relying on GC.
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())
def brain_upload(file_data, filename, content_type, title=None):
    """Multipart upload to /api/items/upload.

    Builds the multipart/form-data body by hand (stdlib only): a required
    "file" part plus an optional "title" part. Returns the parsed JSON reply.
    """
    boundary = "----MigrationBoundary12345"
    parts = []
    # File part — BUG FIX: use the caller-supplied ``filename``; previously a
    # literal placeholder was interpolated, so every upload arrived unnamed.
    parts.append(f"--{boundary}\r\n".encode())
    parts.append(
        f'Content-Disposition: form-data; name="file"; filename="{filename}"\r\n'.encode()
    )
    parts.append(f"Content-Type: {content_type}\r\n\r\n".encode())
    parts.append(file_data)
    parts.append(b"\r\n")
    # Title part (optional)
    if title:
        parts.append(f"--{boundary}\r\n".encode())
        parts.append(b'Content-Disposition: form-data; name="title"\r\n\r\n')
        parts.append(title.encode())
        parts.append(b"\r\n")
    parts.append(f"--{boundary}--\r\n".encode())
    body = b"".join(parts)
    req = urllib.request.Request(
        f"{BRAIN_URL}/api/items/upload",
        data=body,
        headers={
            "X-Gateway-User-Id": BRAIN_USER,
            "Content-Type": f"multipart/form-data; boundary={boundary}",
        },
        method="POST",
    )
    # Close the response deterministically (previously leaked).
    with urllib.request.urlopen(req, timeout=60) as resp:
        return json.loads(resp.read())
def brain_get_item(item_id):
    """GET a single Brain item by id and return the parsed JSON."""
    req = urllib.request.Request(
        f"{BRAIN_URL}/api/items/{item_id}",
        headers={"X-Gateway-User-Id": BRAIN_USER},
    )
    # Close the response deterministically (previously leaked).
    with urllib.request.urlopen(req, timeout=15) as resp:
        return json.loads(resp.read())
def fetch_all_bookmarks():
    """Page through /api/v1/bookmarks (100 per page) and return every bookmark."""
    bookmarks = []
    cursor = None
    while True:
        path = "/api/v1/bookmarks?limit=100"
        if cursor:
            path += f"&cursor={cursor}"
        page = karakeep_get(path)
        batch = page.get("bookmarks", [])
        bookmarks.extend(batch)
        cursor = page.get("nextCursor")
        # Stop when the API reports no further cursor or an empty page.
        if not cursor or not batch:
            break
    return bookmarks
def wait_for_processing(item_id, timeout=120):
    """Poll until item is done processing.

    Re-fetches the item every 3 seconds until processing_status becomes
    "ready" or "error"; on timeout, returns one final fetch regardless.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        item = brain_get_item(item_id)
        if item.get("processing_status", "pending") in ("ready", "error"):
            return item
        time.sleep(3)
    return brain_get_item(item_id)
def main():
    """Migrate every Karakeep bookmark into Brain and print a per-item
    comparison of Karakeep's tags/folder vs the AI classification; the full
    comparison is also written to /tmp/migration_comparison.json."""
    print("Fetching all Karakeep bookmarks...")
    bookmarks = fetch_all_bookmarks()
    print(f"Found {len(bookmarks)} bookmarks\n")
    # Sort: notes first, then links, then assets (PDFs take longer)
    def sort_key(b):
        t = b.get("content", {}).get("type", "")
        return {"text": 0, "link": 1, "asset": 2}.get(t, 3)
    bookmarks.sort(key=sort_key)
    results = {"success": 0, "error": 0, "skipped": 0}
    comparison = []  # one record per migrated item (Karakeep vs AI fields)
    for i, bk in enumerate(bookmarks):
        content = bk.get("content", {})
        bk_type = content.get("type", "unknown")
        bk_title = bk.get("title") or "Untitled"
        bk_tags = [t["name"] for t in bk.get("tags", [])]
        bk_list = bk.get("list", {})
        bk_folder = bk_list.get("name") if bk_list else None
        print(f"[{i+1}/{len(bookmarks)}] {bk_type}: {bk_title[:60]}")
        try:
            if bk_type == "link":
                url = content.get("url", "")
                if not url:
                    print(" SKIP: no URL")
                    results["skipped"] += 1
                    continue
                resp = brain_post_json("/items", {
                    "type": "link",
                    "url": url,
                    # "Untitled" is a placeholder, not a real title — omit it
                    "title": bk_title if bk_title != "Untitled" else None,
                })
            elif bk_type == "text":
                text = content.get("text", "")
                if not text:
                    print(" SKIP: no text")
                    results["skipped"] += 1
                    continue
                resp = brain_post_json("/items", {
                    "type": "note",
                    "raw_content": text,
                    "title": bk_title if bk_title != "Untitled" else None,
                })
            elif bk_type == "asset":
                # Binary assets (PDFs/images) are downloaded from Karakeep
                # and re-uploaded to Brain via multipart.
                asset_id = content.get("assetId")
                asset_type = content.get("assetType", "unknown")
                if not asset_id:
                    print(" SKIP: no assetId")
                    results["skipped"] += 1
                    continue
                print(f" Downloading {asset_type} ({asset_id[:8]})...")
                file_data, ct = karakeep_download(asset_id)
                ext = {"pdf": ".pdf", "image": ".png"}.get(asset_type, ".bin")
                filename = f"{bk_title[:50]}{ext}" if bk_title != "Untitled" else f"upload{ext}"
                # Clean filename
                filename = filename.replace("/", "-").replace("\\", "-")
                if asset_type == "pdf":
                    ct = "application/pdf"
                resp = brain_upload(file_data, filename, ct, title=bk_title if bk_title != "Untitled" else None)
            else:
                print(f" SKIP: unknown type '{bk_type}'")
                results["skipped"] += 1
                continue
            item_id = resp.get("id")
            print(f" Created: {item_id} — waiting for AI classification...")
            # Wait for processing
            final = wait_for_processing(item_id, timeout=90)
            status = final.get("processing_status", "?")
            ai_folder = final.get("folder", "?")
            ai_tags = final.get("tags", [])
            ai_title = final.get("title", "?")
            # Compare
            entry = {
                "karakeep_title": bk_title,
                "karakeep_tags": bk_tags,
                "karakeep_folder": bk_folder,
                "ai_title": ai_title,
                "ai_folder": ai_folder,
                "ai_tags": ai_tags,
                "status": status,
            }
            comparison.append(entry)
            # "OK" when at least one tag overlaps, or both sides are empty.
            tag_match = "OK" if set(bk_tags) & set(ai_tags) or (not bk_tags and not ai_tags) else "DIFF"
            print(f" Status: {status}")
            print(f" AI Folder: {ai_folder} (Karakeep: {bk_folder or 'none'})")
            print(f" AI Tags: {ai_tags} vs Karakeep: {bk_tags} [{tag_match}]")
            print(f" AI Title: {ai_title}")
            results["success"] += 1
        except Exception as e:
            # Best-effort migration: log the failure and continue with the
            # next bookmark rather than aborting the whole run.
            print(f" ERROR: {e}")
            results["error"] += 1
        print()
    # Summary
    print("=" * 60)
    print(f"MIGRATION COMPLETE")
    print(f" Success: {results['success']}")
    print(f" Errors: {results['error']}")
    print(f" Skipped: {results['skipped']}")
    print()
    # Tag comparison summary
    matches = 0
    diffs = 0
    for c in comparison:
        kk = set(c["karakeep_tags"])
        ai = set(c["ai_tags"])
        if kk & ai or (not kk and not ai):
            matches += 1
        else:
            diffs += 1
    print(f"Tag overlap: {matches}/{len(comparison)} items had at least one matching tag")
    print(f"Tag differences: {diffs}/{len(comparison)} items had zero overlap")
    # Save comparison
    with open("/tmp/migration_comparison.json", "w") as f:
        json.dump(comparison, f, indent=2)
    print("\nFull comparison saved to /tmp/migration_comparison.json")


if __name__ == "__main__":
    main()