feat: major platform expansion — Brain service, RSS reader, iOS app, AI assistants, Firefox extension
Brain Service:
- Playwright stealth crawler replacing browserless (og:image, Readability, Reddit JSON API)
- AI classification with tag definitions and folder assignment
- YouTube video download via yt-dlp
- Karakeep migration complete (96 items)
- Taxonomy management (folders with icons/colors, tags)
- Discovery shuffle, sort options, search (Meilisearch + pgvector)
- Item tag/folder editing, card color accents

RSS Reader Service:
- Custom FastAPI reader replacing Miniflux
- Feed management (add/delete/refresh), category support
- Full article extraction via Readability
- Background content fetching for new entries
- Mark all read with confirmation
- Infinite scroll, retention cleanup (30/60 day)
- 17 feeds migrated from Miniflux

iOS App (SwiftUI):
- Native iOS 17+ app with @Observable architecture
- Cookie-based auth, configurable gateway URL
- Dashboard with custom background photo + frosted glass widgets
- Full fitness module (today/templates/goals/food library)
- AI assistant chat (fitness + brain, raw JSON state management)
- 120fps ProMotion support

AI Assistants (Gateway):
- Unified dispatcher with fitness/brain domain detection
- Fitness: natural language food logging, photo analysis, multi-item splitting
- Brain: save/append/update/delete notes, search & answer, undo support
- Madiha user gets fitness-only (brain disabled)

Firefox Extension:
- One-click save to Brain from any page
- Login with platform credentials
- Right-click context menu (save page/link/image)
- Notes field for URL saves
- Signed and published on AMO

Other:
- Reader bookmark button routes to Brain (was Karakeep)
- Fitness food library with "Add" button + add-to-meal popup
- Kindle send file size check (25MB SMTP2GO limit)
- Atelier UI as default (useAtelierShell=true)
- Mobile upload box in nav drawer

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@ from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.config import REDIS_URL, DATABASE_URL_SYNC
|
||||
from app.models.item import Item, ItemAsset
|
||||
from app.models.taxonomy import Folder, Tag, ItemTag # noqa: F401 — register FK targets
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
@@ -34,7 +35,7 @@ async def _process_item(item_id: str):
|
||||
"""Full processing pipeline for a saved item."""
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
|
||||
from app.config import DATABASE_URL
|
||||
from app.services.ingest import fetch_url_content, take_screenshot, archive_html
|
||||
from app.services.ingest import crawl_url, save_screenshot_from_base64, download_og_image, archive_html
|
||||
from app.services.classify import classify_item
|
||||
from app.services.embed import generate_embedding
|
||||
from app.search.engine import index_item, ensure_meili_index
|
||||
@@ -62,42 +63,96 @@ async def _process_item(item_id: str):
|
||||
|
||||
# ── Step 1: Fetch content for URLs ──
|
||||
if item.type == "link" and item.url:
|
||||
log.info(f"Fetching URL: {item.url}")
|
||||
content = await fetch_url_content(item.url)
|
||||
html_content = content.get("html")
|
||||
extracted_text = content.get("text") or extracted_text
|
||||
if not title:
|
||||
title = content.get("title")
|
||||
from app.services.ingest import (
|
||||
_is_youtube_url, download_youtube_thumbnail,
|
||||
download_youtube_video, fetch_youtube_metadata,
|
||||
)
|
||||
|
||||
item.metadata_json = item.metadata_json or {}
|
||||
item.metadata_json["description"] = content.get("description")
|
||||
item.metadata_json["used_browserless"] = content.get("used_browserless", False)
|
||||
is_yt = _is_youtube_url(item.url)
|
||||
|
||||
# Take screenshot
|
||||
screenshot_path = await take_screenshot(item.url, item.id)
|
||||
if screenshot_path:
|
||||
asset = ItemAsset(
|
||||
id=str(uuid.uuid4()),
|
||||
item_id=item.id,
|
||||
asset_type="screenshot",
|
||||
filename="screenshot.png",
|
||||
content_type="image/png",
|
||||
storage_path=screenshot_path,
|
||||
)
|
||||
db.add(asset)
|
||||
if is_yt:
|
||||
# YouTube: use oEmbed + thumbnail + yt-dlp (no crawler needed)
|
||||
log.info(f"Processing YouTube URL: {item.url}")
|
||||
yt_meta = await fetch_youtube_metadata(item.url)
|
||||
if yt_meta:
|
||||
if not title:
|
||||
title = yt_meta.get("title")
|
||||
extracted_text = f"YouTube: {yt_meta.get('title','')}\nBy: {yt_meta.get('author','')}"
|
||||
item.metadata_json["youtube"] = {
|
||||
"video_id": yt_meta.get("video_id"),
|
||||
"author": yt_meta.get("author"),
|
||||
"is_short": yt_meta.get("is_short", False),
|
||||
}
|
||||
item.metadata_json["description"] = f"YouTube video by {yt_meta.get('author','')}"
|
||||
|
||||
# Archive HTML
|
||||
if html_content:
|
||||
html_path = await archive_html(html_content, item.id)
|
||||
if html_path:
|
||||
asset = ItemAsset(
|
||||
id=str(uuid.uuid4()),
|
||||
item_id=item.id,
|
||||
asset_type="archived_html",
|
||||
filename="page.html",
|
||||
content_type="text/html",
|
||||
storage_path=html_path,
|
||||
)
|
||||
db.add(asset)
|
||||
# Download video
|
||||
log.info(f"Downloading YouTube video: {item.url}")
|
||||
video_path, yt_info = await download_youtube_video(item.url, item.id)
|
||||
if video_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="video", filename=f"{yt_meta['video_id']}.mp4",
|
||||
content_type="video/mp4", storage_path=video_path,
|
||||
))
|
||||
if yt_info.get("duration"):
|
||||
item.metadata_json["youtube"]["duration"] = yt_info["duration"]
|
||||
if yt_info.get("description"):
|
||||
item.metadata_json["youtube"]["description"] = yt_info["description"][:500]
|
||||
extracted_text = f"YouTube: {title or ''}\nBy: {(yt_meta or {}).get('author','')}\n{yt_info['description'][:2000]}"
|
||||
|
||||
# Thumbnail
|
||||
thumb_path = await download_youtube_thumbnail(item.url, item.id)
|
||||
if thumb_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="screenshot", filename="thumbnail.jpg",
|
||||
content_type="image/jpeg", storage_path=thumb_path,
|
||||
))
|
||||
|
||||
else:
|
||||
# Regular URL: use Playwright crawler (stealth)
|
||||
log.info(f"Crawling URL: {item.url}")
|
||||
crawl = await crawl_url(item.url)
|
||||
html_content = crawl.get("html")
|
||||
extracted_text = crawl.get("text") or extracted_text
|
||||
if not title:
|
||||
title = crawl.get("title")
|
||||
item.metadata_json["description"] = crawl.get("description")
|
||||
item.metadata_json["author"] = crawl.get("author")
|
||||
item.metadata_json["status_code"] = crawl.get("status_code")
|
||||
|
||||
# Screenshot (from crawler, base64 JPEG)
|
||||
if crawl.get("screenshot"):
|
||||
ss_path = await save_screenshot_from_base64(crawl["screenshot"], item.id)
|
||||
if ss_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="screenshot", filename="screenshot.jpg",
|
||||
content_type="image/jpeg", storage_path=ss_path,
|
||||
))
|
||||
|
||||
# og:image (extracted from rendered DOM by crawler)
|
||||
og_url = crawl.get("og_image_url")
|
||||
if og_url:
|
||||
og_path = await download_og_image(og_url, item.id)
|
||||
if og_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="og_image", filename="og_image.jpg",
|
||||
content_type="image/jpeg", storage_path=og_path,
|
||||
))
|
||||
item.metadata_json["og_image_url"] = og_url
|
||||
|
||||
# Archive HTML
|
||||
if html_content:
|
||||
html_path = await archive_html(html_content, item.id)
|
||||
if html_path:
|
||||
db.add(ItemAsset(
|
||||
id=str(uuid.uuid4()), item_id=item.id,
|
||||
asset_type="archived_html", filename="page.html",
|
||||
content_type="text/html", storage_path=html_path,
|
||||
))
|
||||
|
||||
# ── Step 1b: Process uploaded files (PDF, image, document) ──
|
||||
if item.type in ("pdf", "image", "document", "file"):
|
||||
|
||||
Reference in New Issue
Block a user