Brain Service: - Playwright stealth crawler replacing browserless (og:image, Readability, Reddit JSON API) - AI classification with tag definitions and folder assignment - YouTube video download via yt-dlp - Karakeep migration complete (96 items) - Taxonomy management (folders with icons/colors, tags) - Discovery shuffle, sort options, search (Meilisearch + pgvector) - Item tag/folder editing, card color accents RSS Reader Service: - Custom FastAPI reader replacing Miniflux - Feed management (add/delete/refresh), category support - Full article extraction via Readability - Background content fetching for new entries - Mark all read with confirmation - Infinite scroll, retention cleanup (30/60 day) - 17 feeds migrated from Miniflux iOS App (SwiftUI): - Native iOS 17+ app with @Observable architecture - Cookie-based auth, configurable gateway URL - Dashboard with custom background photo + frosted glass widgets - Full fitness module (today/templates/goals/food library) - AI assistant chat (fitness + brain, raw JSON state management) - 120fps ProMotion support AI Assistants (Gateway): - Unified dispatcher with fitness/brain domain detection - Fitness: natural language food logging, photo analysis, multi-item splitting - Brain: save/append/update/delete notes, search & answer, undo support - Madiha user gets fitness-only (brain disabled) Firefox Extension: - One-click save to Brain from any page - Login with platform credentials - Right-click context menu (save page/link/image) - Notes field for URL saves - Signed and published on AMO Other: - Reader bookmark button routes to Brain (was Karakeep) - Fitness food library with "Add" button + add-to-meal popup - Kindle send file size check (25MB SMTP2GO limit) - Atelier UI as default (useAtelierShell=true) - Mobile upload box in nav drawer Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
221 lines
7.7 KiB
Python
221 lines
7.7 KiB
Python
"""Content ingestion — Playwright crawler for HTML, screenshots, og:image."""
|
|
|
|
import base64
|
|
import logging
|
|
import re
|
|
from urllib.parse import urlparse
|
|
|
|
import httpx
|
|
|
|
from app.config import CRAWLER_URL
|
|
from app.services.storage import storage
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
# ── YouTube helpers ──
|
|
|
|
def _extract_youtube_id(url: str) -> str | None:
|
|
patterns = [
|
|
r'(?:youtube\.com/watch\?.*v=|youtu\.be/|youtube\.com/shorts/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})',
|
|
]
|
|
for pat in patterns:
|
|
m = re.search(pat, url)
|
|
if m:
|
|
return m.group(1)
|
|
return None
|
|
|
|
|
|
def _is_youtube_url(url: str) -> bool:
    """Return True when *url* contains a recognizable YouTube video ID."""
    return _extract_youtube_id(url) is not None
|
|
|
|
|
|
async def fetch_youtube_metadata(url: str) -> dict | None:
    """Fetch YouTube video metadata via oEmbed. No API key needed.

    Returns a dict with title / description / author / thumbnail_url /
    video_id / is_short, or None when *url* has no YouTube video ID.
    Fields left unfilled by both metadata services remain None.
    """
    video_id = _extract_youtube_id(url)
    if video_id is None:
        return None

    watch_url = f"https://www.youtube.com/watch?v={video_id}"
    meta = {
        "title": None,
        "description": None,
        "author": None,
        # maxres thumbnail URL is constructed unconditionally; callers that
        # need a guaranteed image should use download_youtube_thumbnail.
        "thumbnail_url": f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
        "video_id": video_id,
        "is_short": "/shorts/" in url,
    }

    try:
        async with httpx.AsyncClient(timeout=10) as client:
            # Primary source: YouTube's official oEmbed endpoint.
            resp = await client.get(
                f"https://www.youtube.com/oembed?url={watch_url}&format=json"
            )
            if resp.status_code == 200:
                payload = resp.json()
                meta["title"] = payload.get("title")
                meta["author"] = payload.get("author_name")

            # Secondary source: noembed.com fills whatever is still missing.
            resp = await client.get(f"https://noembed.com/embed?url={watch_url}")
            if resp.status_code == 200:
                payload = resp.json()
                meta["title"] = meta["title"] or payload.get("title")
                meta["author"] = meta["author"] or payload.get("author_name")
    except Exception as e:
        log.warning(f"YouTube metadata fetch failed: {e}")

    return meta
|
|
|
|
|
|
async def download_youtube_thumbnail(url: str, item_id: str) -> str | None:
    """Download YouTube thumbnail and save as screenshot asset.

    Tries the maxres variant first, then hqdefault.  Responses of
    1000 bytes or less are skipped (presumably YouTube's tiny
    "not available" placeholder — TODO confirm).  Returns the stored
    asset path, or None when nothing usable was fetched.
    """
    video_id = _extract_youtube_id(url)
    if video_id is None:
        return None

    candidates = (
        f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
        f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg",
    )
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            for candidate in candidates:
                resp = await client.get(candidate)
                if resp.status_code != 200 or len(resp.content) <= 1000:
                    continue
                return storage.save(
                    item_id=item_id, asset_type="screenshot",
                    filename="thumbnail.jpg", data=resp.content,
                )
    except Exception as e:
        log.warning(f"YouTube thumbnail download failed: {e}")
    return None
|
|
|
|
|
|
async def download_youtube_video(url: str, item_id: str) -> tuple[str | None, dict]:
    """Download YouTube video via yt-dlp.

    Returns (storage_path, info_dict) on success, or (None, {}) when the
    URL has no video ID, yt-dlp fails or times out (120s), or no mp4 was
    produced.  info_dict is yt-dlp's .info.json metadata when available.
    """
    import asyncio
    import json
    import os
    import subprocess
    import tempfile

    video_id = _extract_youtube_id(url)
    if not video_id:
        return None, {}

    with tempfile.TemporaryDirectory() as tmpdir:
        outpath = os.path.join(tmpdir, "%(id)s.%(ext)s")
        # Cap at 720p mp4 to keep stored files small; fall back through
        # progressively looser format selectors if the preferred combo
        # (separate 720p video + m4a audio, merged) is unavailable.
        cmd = [
            "yt-dlp", "--no-playlist",
            "-f", "bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/best[height<=720][ext=mp4]/best[height<=720]",
            "--merge-output-format", "mp4",
            "--write-info-json", "--no-write-playlist-metafiles",
            "-o", outpath, url,
        ]
        try:
            # Run the blocking subprocess off the event loop thread.
            proc = await asyncio.to_thread(
                subprocess.run, cmd, capture_output=True, text=True, timeout=120,
            )
            if proc.returncode != 0:
                log.warning(f"yt-dlp failed: {proc.stderr[:300]}")
                return None, {}

            video_file = None
            info = {}
            for f in os.listdir(tmpdir):
                if f.endswith(".mp4"):
                    video_file = os.path.join(tmpdir, f)
                elif f.endswith(".info.json"):
                    with open(os.path.join(tmpdir, f)) as fh:
                        info = json.load(fh)

            if not video_file:
                return None, {}

            # Fix: the original leaked a file handle via bare open().read();
            # a context manager guarantees the handle is closed.
            with open(video_file, "rb") as vf:
                file_data = vf.read()
            path = storage.save(
                item_id=item_id, asset_type="video",
                filename=f"{video_id}.mp4", data=file_data,
            )
            log.info(f"Downloaded YouTube video: {len(file_data)} bytes -> {path}")
            return path, info
        except subprocess.TimeoutExpired:
            log.warning(f"yt-dlp timed out for {url}")
            return None, {}
        except Exception as e:
            log.error(f"YouTube download failed: {e}")
            return None, {}
|
|
|
|
|
|
# ── Main crawler (Playwright stealth service) ──
|
|
|
|
async def crawl_url(url: str) -> dict:
    """Call the Playwright crawler service. Returns dict with html, text, title,
    description, author, og_image_url, screenshot (base64), status_code, error.

    On success the crawler's JSON response is returned as-is; on any
    failure a stub dict with None fields and an "error" message is
    returned instead of raising.
    """
    # Fix: the original did `str(e) if 'e' in dir() else "unknown"` — but
    # `except ... as e` unbinds `e` when the handler exits (PEP 3110), so
    # the fallback always said "unknown".  Capture the message explicitly.
    error_msg = "unknown"
    try:
        async with httpx.AsyncClient(timeout=45) as client:
            resp = await client.post(f"{CRAWLER_URL}/crawl", json={"url": url})
            if resp.status_code == 200:
                return resp.json()
            log.warning(f"Crawler returned {resp.status_code} for {url}")
            error_msg = f"crawler returned {resp.status_code}"
    except Exception as e:
        log.error(f"Crawler request failed for {url}: {e}")
        error_msg = str(e)
    return {"url": url, "html": None, "text": None, "title": None,
            "description": None, "author": None, "og_image_url": None,
            "screenshot": None, "status_code": None, "error": error_msg}
|
|
|
|
|
|
async def save_screenshot_from_base64(b64: str, item_id: str) -> str | None:
|
|
"""Decode base64 screenshot and save to storage."""
|
|
try:
|
|
data = base64.b64decode(b64)
|
|
if len(data) < 500:
|
|
return None
|
|
path = storage.save(
|
|
item_id=item_id, asset_type="screenshot",
|
|
filename="screenshot.jpg", data=data,
|
|
)
|
|
return path
|
|
except Exception as e:
|
|
log.error(f"Screenshot save failed: {e}")
|
|
return None
|
|
|
|
|
|
async def download_og_image(og_url: str, item_id: str) -> str | None:
    """Download an og:image and save as asset.

    Returns the stored asset path, or None on non-200 response, a tiny
    (<= 1000 byte) body, or any exception.
    """
    # Clean HTML entities from URL: og:image values scraped from raw HTML
    # carry &amp; in query strings.  (Fix: the original replace("&", "&")
    # was a no-op — the entity form had evidently been lost.)
    og_url = og_url.replace("&amp;", "&")
    try:
        async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
            resp = await client.get(og_url, headers={
                "User-Agent": "Mozilla/5.0 (compatible; SecondBrain/1.0)"
            })
            if resp.status_code == 200 and len(resp.content) > 1000:
                # Pick the file extension from the content type; default jpg.
                ct = resp.headers.get("content-type", "image/jpeg")
                ext = "png" if "png" in ct else "jpg"
                path = storage.save(
                    item_id=item_id, asset_type="og_image",
                    filename=f"og_image.{ext}", data=resp.content,
                )
                log.info(f"Downloaded og:image ({len(resp.content)} bytes) for {item_id}")
                return path
    except Exception as e:
        log.warning(f"og:image download failed: {e}")
    return None
|
|
|
|
|
|
async def archive_html(html: str, item_id: str) -> str | None:
|
|
"""Save full HTML as an archived asset."""
|
|
if not html:
|
|
return None
|
|
try:
|
|
path = storage.save(
|
|
item_id=item_id, asset_type="archived_html",
|
|
filename="page.html", data=html.encode("utf-8"),
|
|
)
|
|
return path
|
|
except Exception as e:
|
|
log.error(f"HTML archive failed: {e}")
|
|
return None
|