Brain Service: - Playwright stealth crawler replacing browserless (og:image, Readability, Reddit JSON API) - AI classification with tag definitions and folder assignment - YouTube video download via yt-dlp - Karakeep migration complete (96 items) - Taxonomy management (folders with icons/colors, tags) - Discovery shuffle, sort options, search (Meilisearch + pgvector) - Item tag/folder editing, card color accents RSS Reader Service: - Custom FastAPI reader replacing Miniflux - Feed management (add/delete/refresh), category support - Full article extraction via Readability - Background content fetching for new entries - Mark all read with confirmation - Infinite scroll, retention cleanup (30/60 day) - 17 feeds migrated from Miniflux iOS App (SwiftUI): - Native iOS 17+ app with @Observable architecture - Cookie-based auth, configurable gateway URL - Dashboard with custom background photo + frosted glass widgets - Full fitness module (today/templates/goals/food library) - AI assistant chat (fitness + brain, raw JSON state management) - 120fps ProMotion support AI Assistants (Gateway): - Unified dispatcher with fitness/brain domain detection - Fitness: natural language food logging, photo analysis, multi-item splitting - Brain: save/append/update/delete notes, search & answer, undo support - Madiha user gets fitness-only (brain disabled) Firefox Extension: - One-click save to Brain from any page - Login with platform credentials - Right-click context menu (save page/link/image) - Notes field for URL saves - Signed and published on AMO Other: - Reader bookmark button routes to Brain (was Karakeep) - Fitness food library with "Add" button + add-to-meal popup - Kindle send file size check (25MB SMTP2GO limit) - Atelier UI as default (useAtelierShell=true) - Mobile upload box in nav drawer Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
221 lines
7.7 KiB
Python
221 lines
7.7 KiB
Python
"""Content ingestion — Playwright crawler for HTML, screenshots, og:image."""
|
|
|
|
import base64
|
|
import logging
|
|
import re
|
|
from urllib.parse import urlparse
|
|
|
|
import httpx
|
|
|
|
from app.config import CRAWLER_URL
|
|
from app.services.storage import storage
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
# ── YouTube helpers ──
|
|
|
|
def _extract_youtube_id(url: str) -> str | None:
|
|
patterns = [
|
|
r'(?:youtube\.com/watch\?.*v=|youtu\.be/|youtube\.com/shorts/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})',
|
|
]
|
|
for pat in patterns:
|
|
m = re.search(pat, url)
|
|
if m:
|
|
return m.group(1)
|
|
return None
|
|
|
|
|
|
def _is_youtube_url(url: str) -> bool:
    """Return True when *url* contains a recognizable YouTube video ID."""
    return _extract_youtube_id(url) is not None
|
|
|
|
|
|
async def fetch_youtube_metadata(url: str) -> dict | None:
    """Fetch YouTube video metadata via oEmbed. No API key needed.

    Returns a dict with title / description / author / thumbnail_url /
    video_id / is_short, or None when *url* has no YouTube video ID.
    Fields left unfilled by both metadata services remain None.
    """
    video_id = _extract_youtube_id(url)
    if video_id is None:
        return None

    watch_url = f"https://www.youtube.com/watch?v={video_id}"
    meta = {
        "title": None,
        "description": None,
        "author": None,
        # maxres thumbnail URL is constructed unconditionally; callers that
        # need a guaranteed image should use download_youtube_thumbnail.
        "thumbnail_url": f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
        "video_id": video_id,
        "is_short": "/shorts/" in url,
    }

    try:
        async with httpx.AsyncClient(timeout=10) as client:
            # Primary source: YouTube's official oEmbed endpoint.
            resp = await client.get(
                f"https://www.youtube.com/oembed?url={watch_url}&format=json"
            )
            if resp.status_code == 200:
                payload = resp.json()
                meta["title"] = payload.get("title")
                meta["author"] = payload.get("author_name")

            # Secondary source: noembed.com fills whatever is still missing.
            resp = await client.get(f"https://noembed.com/embed?url={watch_url}")
            if resp.status_code == 200:
                payload = resp.json()
                meta["title"] = meta["title"] or payload.get("title")
                meta["author"] = meta["author"] or payload.get("author_name")
    except Exception as e:
        log.warning(f"YouTube metadata fetch failed: {e}")

    return meta
|
|
|
|
|
|
async def download_youtube_thumbnail(url: str, item_id: str) -> str | None:
    """Download YouTube thumbnail and save as screenshot asset.

    Tries the maxres variant first, then hqdefault.  Responses of
    1000 bytes or less are skipped (presumably YouTube's tiny
    "not available" placeholder — TODO confirm).  Returns the stored
    asset path, or None when nothing usable was fetched.
    """
    video_id = _extract_youtube_id(url)
    if video_id is None:
        return None

    candidates = (
        f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
        f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg",
    )
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            for candidate in candidates:
                resp = await client.get(candidate)
                if resp.status_code != 200 or len(resp.content) <= 1000:
                    continue
                return storage.save(
                    item_id=item_id, asset_type="screenshot",
                    filename="thumbnail.jpg", data=resp.content,
                )
    except Exception as e:
        log.warning(f"YouTube thumbnail download failed: {e}")
    return None
|
|
|
|
|
|
async def download_youtube_video(url: str, item_id: str) -> tuple[str | None, dict]:
    """Download YouTube video via yt-dlp.

    Returns (storage_path, info_dict) on success, or (None, {}) when the
    URL has no video ID, yt-dlp fails or times out (120s), or no mp4 was
    produced.  info_dict is yt-dlp's .info.json metadata when available.
    """
    import asyncio
    import json
    import os
    import subprocess
    import tempfile

    video_id = _extract_youtube_id(url)
    if not video_id:
        return None, {}

    with tempfile.TemporaryDirectory() as tmpdir:
        outpath = os.path.join(tmpdir, "%(id)s.%(ext)s")
        # Cap at 720p mp4 to keep stored files small; fall back through
        # progressively looser format selectors if the preferred combo
        # (separate 720p video + m4a audio, merged) is unavailable.
        cmd = [
            "yt-dlp", "--no-playlist",
            "-f", "bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/best[height<=720][ext=mp4]/best[height<=720]",
            "--merge-output-format", "mp4",
            "--write-info-json", "--no-write-playlist-metafiles",
            "-o", outpath, url,
        ]
        try:
            # Run the blocking subprocess off the event loop thread.
            proc = await asyncio.to_thread(
                subprocess.run, cmd, capture_output=True, text=True, timeout=120,
            )
            if proc.returncode != 0:
                log.warning(f"yt-dlp failed: {proc.stderr[:300]}")
                return None, {}

            video_file = None
            info = {}
            for f in os.listdir(tmpdir):
                if f.endswith(".mp4"):
                    video_file = os.path.join(tmpdir, f)
                elif f.endswith(".info.json"):
                    with open(os.path.join(tmpdir, f)) as fh:
                        info = json.load(fh)

            if not video_file:
                return None, {}

            # Fix: the original leaked a file handle via bare open().read();
            # a context manager guarantees the handle is closed.
            with open(video_file, "rb") as vf:
                file_data = vf.read()
            path = storage.save(
                item_id=item_id, asset_type="video",
                filename=f"{video_id}.mp4", data=file_data,
            )
            log.info(f"Downloaded YouTube video: {len(file_data)} bytes -> {path}")
            return path, info
        except subprocess.TimeoutExpired:
            log.warning(f"yt-dlp timed out for {url}")
            return None, {}
        except Exception as e:
            log.error(f"YouTube download failed: {e}")
            return None, {}
|
|
|
|
|
|
# ── Main crawler (Playwright stealth service) ──
|
|
|
|
async def crawl_url(url: str) -> dict:
    """Call the Playwright crawler service. Returns dict with html, text, title,
    description, author, og_image_url, screenshot (base64), status_code, error.

    On success the crawler's JSON response is returned as-is; on any
    failure a stub dict with None fields and an "error" message is
    returned instead of raising.
    """
    # Fix: the original did `str(e) if 'e' in dir() else "unknown"` — but
    # `except ... as e` unbinds `e` when the handler exits (PEP 3110), so
    # the fallback always said "unknown".  Capture the message explicitly.
    error_msg = "unknown"
    try:
        async with httpx.AsyncClient(timeout=45) as client:
            resp = await client.post(f"{CRAWLER_URL}/crawl", json={"url": url})
            if resp.status_code == 200:
                return resp.json()
            log.warning(f"Crawler returned {resp.status_code} for {url}")
            error_msg = f"crawler returned {resp.status_code}"
    except Exception as e:
        log.error(f"Crawler request failed for {url}: {e}")
        error_msg = str(e)
    return {"url": url, "html": None, "text": None, "title": None,
            "description": None, "author": None, "og_image_url": None,
            "screenshot": None, "status_code": None, "error": error_msg}
|
|
|
|
|
|
async def save_screenshot_from_base64(b64: str, item_id: str) -> str | None:
|
|
"""Decode base64 screenshot and save to storage."""
|
|
try:
|
|
data = base64.b64decode(b64)
|
|
if len(data) < 500:
|
|
return None
|
|
path = storage.save(
|
|
item_id=item_id, asset_type="screenshot",
|
|
filename="screenshot.jpg", data=data,
|
|
)
|
|
return path
|
|
except Exception as e:
|
|
log.error(f"Screenshot save failed: {e}")
|
|
return None
|
|
|
|
|
|
async def download_og_image(og_url: str, item_id: str) -> str | None:
    """Download an og:image and save as asset.

    Returns the stored asset path, or None on non-200 response, a tiny
    (<= 1000 byte) body, or any exception.
    """
    # Clean HTML entities from URL: og:image values scraped from raw HTML
    # carry &amp; in query strings.  (Fix: the original replace("&", "&")
    # was a no-op — the entity form had evidently been lost.)
    og_url = og_url.replace("&amp;", "&")
    try:
        async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
            resp = await client.get(og_url, headers={
                "User-Agent": "Mozilla/5.0 (compatible; SecondBrain/1.0)"
            })
            if resp.status_code == 200 and len(resp.content) > 1000:
                # Pick the file extension from the content type; default jpg.
                ct = resp.headers.get("content-type", "image/jpeg")
                ext = "png" if "png" in ct else "jpg"
                path = storage.save(
                    item_id=item_id, asset_type="og_image",
                    filename=f"og_image.{ext}", data=resp.content,
                )
                log.info(f"Downloaded og:image ({len(resp.content)} bytes) for {item_id}")
                return path
    except Exception as e:
        log.warning(f"og:image download failed: {e}")
    return None
|
|
|
|
|
|
async def archive_html(html: str, item_id: str) -> str | None:
|
|
"""Save full HTML as an archived asset."""
|
|
if not html:
|
|
return None
|
|
try:
|
|
path = storage.save(
|
|
item_id=item_id, asset_type="archived_html",
|
|
filename="page.html", data=html.encode("utf-8"),
|
|
)
|
|
return path
|
|
except Exception as e:
|
|
log.error(f"HTML archive failed: {e}")
|
|
return None
|