feat: major platform expansion — Brain service, RSS reader, iOS app, AI assistants, Firefox extension
Brain Service:
- Playwright stealth crawler replacing browserless (og:image, Readability, Reddit JSON API)
- AI classification with tag definitions and folder assignment
- YouTube video download via yt-dlp
- Karakeep migration complete (96 items)
- Taxonomy management (folders with icons/colors, tags)
- Discovery shuffle, sort options, search (Meilisearch + pgvector)
- Item tag/folder editing, card color accents

RSS Reader Service:
- Custom FastAPI reader replacing Miniflux
- Feed management (add/delete/refresh), category support
- Full article extraction via Readability
- Background content fetching for new entries
- Mark all read with confirmation
- Infinite scroll, retention cleanup (30/60 day)
- 17 feeds migrated from Miniflux

iOS App (SwiftUI):
- Native iOS 17+ app with @Observable architecture
- Cookie-based auth, configurable gateway URL
- Dashboard with custom background photo + frosted glass widgets
- Full fitness module (today/templates/goals/food library)
- AI assistant chat (fitness + brain, raw JSON state management)
- 120fps ProMotion support

AI Assistants (Gateway):
- Unified dispatcher with fitness/brain domain detection
- Fitness: natural language food logging, photo analysis, multi-item splitting
- Brain: save/append/update/delete notes, search & answer, undo support
- Madiha user gets fitness-only (brain disabled)

Firefox Extension:
- One-click save to Brain from any page
- Login with platform credentials
- Right-click context menu (save page/link/image)
- Notes field for URL saves
- Signed and published on AMO

Other:
- Reader bookmark button routes to Brain (was Karakeep)
- Fitness food library with "Add" button + add-to-meal popup
- Kindle send file size check (25MB SMTP2GO limit)
- Atelier UI as default (useAtelierShell=true)
- Mobile upload box in nav drawer

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@ from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.config import REDIS_URL, DATABASE_URL_SYNC
|
||||
from app.models.item import Item, ItemAsset
|
||||
from app.models.taxonomy import Folder, Tag, ItemTag # noqa: F401 — register FK targets
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
@@ -34,7 +35,7 @@ async def _process_item(item_id: str):
|
||||
"""Full processing pipeline for a saved item."""
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
|
||||
from app.config import DATABASE_URL
|
||||
from app.services.ingest import fetch_url_content, take_screenshot, archive_html
|
||||
from app.services.ingest import crawl_url, save_screenshot_from_base64, download_og_image, archive_html
|
||||
from app.services.classify import classify_item
|
||||
from app.services.embed import generate_embedding
|
||||
from app.search.engine import index_item, ensure_meili_index
|
||||
@@ -62,42 +63,96 @@ async def _process_item(item_id: str):
|
||||
|
||||
# ── Step 1: Fetch content for URLs ──
|
||||
if item.type == "link" and item.url:
|
||||
log.info(f"Fetching URL: {item.url}")
|
||||
content = await fetch_url_content(item.url)
|
||||
html_content = content.get("html")
|
||||
extracted_text = content.get("text") or extracted_text
|
||||
if not title:
|
||||
title = content.get("title")
|
||||
from app.services.ingest import (
|
||||
_is_youtube_url, download_youtube_thumbnail,
|
||||
download_youtube_video, fetch_youtube_metadata,
|
||||
)
|
||||
|
||||
item.metadata_json = item.metadata_json or {}
|
||||
item.metadata_json["description"] = content.get("description")
|
||||
item.metadata_json["used_browserless"] = content.get("used_browserless", False)
|
||||
is_yt = _is_youtube_url(item.url)
|
||||
|
||||
# Take screenshot
|
||||
screenshot_path = await take_screenshot(item.url, item.id)
|
||||
if screenshot_path:
|
||||
asset = ItemAsset(
|
||||
id=str(uuid.uuid4()),
|
||||
item_id=item.id,
|
||||
asset_type="screenshot",
|
||||
filename="screenshot.png",
|
||||
content_type="image/png",
|
||||
storage_path=screenshot_path,
|
||||
)
|
||||
db.add(asset)
|
||||
if is_yt:
|
||||
# YouTube: use oEmbed + thumbnail + yt-dlp (no crawler needed)
|
||||
log.info(f"Processing YouTube URL: {item.url}")
|
||||
yt_meta = await fetch_youtube_metadata(item.url)
|
||||
if yt_meta:
|
||||
if not title:
|
||||
title = yt_meta.get("title")
|
||||
extracted_text = f"YouTube: {yt_meta.get('title','')}\nBy: {yt_meta.get('author','')}"
|
||||
item.metadata_json["youtube"] = {
|
||||
"video_id": yt_meta.get("video_id"),
|
||||
"author": yt_meta.get("author"),
|
||||
"is_short": yt_meta.get("is_short", False),
|
||||
}
|
||||
item.metadata_json["description"] = f"YouTube video by {yt_meta.get('author','')}"
|
||||
|
||||
# Archive HTML
|
||||
if html_content:
|
||||
html_path = await archive_html(html_content, item.id)
|
||||
if html_path:
|
||||
asset = ItemAsset(
|
||||
id=str(uuid.uuid4()),
|
||||
item_id=item.id,
|
||||
asset_type="archived_html",
|
||||
filename="page.html",
|
||||
content_type="text/html",
|
||||
storage_path=html_path,
|
||||
)
|
||||
db.add(asset)
|
||||
# Download video
|
||||
log.info(f"Downloading YouTube video: {item.url}")
|
||||
video_path, yt_info = await download_youtube_video(item.url, item.id)
|
||||
if video_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="video", filename=f"{yt_meta['video_id']}.mp4",
|
||||
content_type="video/mp4", storage_path=video_path,
|
||||
))
|
||||
if yt_info.get("duration"):
|
||||
item.metadata_json["youtube"]["duration"] = yt_info["duration"]
|
||||
if yt_info.get("description"):
|
||||
item.metadata_json["youtube"]["description"] = yt_info["description"][:500]
|
||||
extracted_text = f"YouTube: {title or ''}\nBy: {(yt_meta or {}).get('author','')}\n{yt_info['description'][:2000]}"
|
||||
|
||||
# Thumbnail
|
||||
thumb_path = await download_youtube_thumbnail(item.url, item.id)
|
||||
if thumb_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="screenshot", filename="thumbnail.jpg",
|
||||
content_type="image/jpeg", storage_path=thumb_path,
|
||||
))
|
||||
|
||||
else:
|
||||
# Regular URL: use Playwright crawler (stealth)
|
||||
log.info(f"Crawling URL: {item.url}")
|
||||
crawl = await crawl_url(item.url)
|
||||
html_content = crawl.get("html")
|
||||
extracted_text = crawl.get("text") or extracted_text
|
||||
if not title:
|
||||
title = crawl.get("title")
|
||||
item.metadata_json["description"] = crawl.get("description")
|
||||
item.metadata_json["author"] = crawl.get("author")
|
||||
item.metadata_json["status_code"] = crawl.get("status_code")
|
||||
|
||||
# Screenshot (from crawler, base64 JPEG)
|
||||
if crawl.get("screenshot"):
|
||||
ss_path = await save_screenshot_from_base64(crawl["screenshot"], item.id)
|
||||
if ss_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="screenshot", filename="screenshot.jpg",
|
||||
content_type="image/jpeg", storage_path=ss_path,
|
||||
))
|
||||
|
||||
# og:image (extracted from rendered DOM by crawler)
|
||||
og_url = crawl.get("og_image_url")
|
||||
if og_url:
|
||||
og_path = await download_og_image(og_url, item.id)
|
||||
if og_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="og_image", filename="og_image.jpg",
|
||||
content_type="image/jpeg", storage_path=og_path,
|
||||
))
|
||||
item.metadata_json["og_image_url"] = og_url
|
||||
|
||||
# Archive HTML
|
||||
if html_content:
|
||||
html_path = await archive_html(html_content, item.id)
|
||||
if html_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="archived_html", filename="page.html",
|
||||
content_type="text/html", storage_path=html_path,
|
||||
))
|
||||
|
||||
# ── Step 1b: Process uploaded files (PDF, image, document) ──
|
||||
if item.type in ("pdf", "image", "document", "file"):
|
||||
|
||||
Reference in New Issue
Block a user