feat: thumbnail extraction for Reader — fixes all clients

Server-side (dashboard + iOS + any client):
- Added thumbnail column to reader_entries
- Worker extracts from media:thumbnail, media:content, enclosures, HTML img
- API returns thumbnail in EntryOut with `&amp;` decoding
- Backfilled 260 existing entries

iOS:
- Prefers API thumbnail, falls back to client-side extraction
- Decodes HTML entities in URLs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 19:32:47 -05:00
parent 798ba17a93
commit a3eabf3e3b
4 changed files with 59 additions and 1 deletions
--- a/ios/Platform/Platform/Features/Reader/Models/ReaderModels.swift
+++ b/ios/Platform/Platform/Features/Reader/Models/ReaderModels.swift
@@ -18,6 +18,7 @@ struct ReaderEntry: Codable, Identifiable, Hashable {
    let status: String
    let starred: Bool
    let readingTime: Int
    let thumbnail: String?
    let feed: ReaderFeedRef?
    var isRead: Bool { status == "read" }
@@ -49,7 +50,12 @@ struct ReaderEntry: Codable, Identifiable, Hashable {
    }
    /// URL of the image to display as this entry's thumbnail, if one can be found.
    ///
    /// Uses the `thumbnail` string when it is non-nil, non-empty, and accepted by
    /// `URL(string:)`. Otherwise falls back to `ReaderEntry.extractThumbnail(from:)`
    /// applied to `content`, then `fullContent`, then the empty string.
    var thumbnailURL: URL? {
-        ReaderEntry.extractThumbnail(from: content ?? fullContent ?? "")
+        // Prefer server-provided thumbnail
        if let thumb = thumbnail, !thumb.isEmpty, let url = URL(string: thumb) {
            return url
        }
        // Fallback: extract from content.
        // NOTE(review): `??` only falls through on nil, so an empty but non-nil
        // `content` is passed as-is and `fullContent` is never consulted — confirm
        // this is intended.
        return ReaderEntry.extractThumbnail(from: content ?? fullContent ?? "")
    }
    private static let imgRegex = try! NSRegularExpression(
@@ -68,6 +74,9 @@ struct ReaderEntry: Codable, Identifiable, Hashable {
            return nil
        }
        let src = String(searchRange[srcRange])
            .replacingOccurrences(of: "&amp;", with: "&")
            .replacingOccurrences(of: "&lt;", with: "<")
            .replacingOccurrences(of: "&gt;", with: ">")
        return URL(string: src)
    }
--- a/services/reader/app/api/entries.py
+++ b/services/reader/app/api/entries.py
@@ -40,6 +40,7 @@ class EntryOut(BaseModel):
    status: str = "unread"
    starred: bool = False
    reading_time: int = 1
    thumbnail: str | None = None
    feed: FeedRef | None = None
    class Config:
@@ -49,6 +50,10 @@ class EntryOut(BaseModel):
    def from_entry(cls, entry: Entry) -> "EntryOut":
        # Use full_content if available, otherwise RSS content
        best_content = entry.full_content if entry.full_content else entry.content
        # Extract thumbnail from stored field, or from content
        thumb = entry.thumbnail
        if not thumb:
            thumb = cls._extract_thumbnail(entry.content or entry.full_content or "")
        return cls(
            id=entry.id,
            title=entry.title,
@@ -60,9 +65,25 @@ class EntryOut(BaseModel):
            status=entry.status,
            starred=entry.starred,
            reading_time=entry.reading_time,
            thumbnail=thumb,
            feed=FeedRef(id=entry.feed.id, title=entry.feed.title) if entry.feed else None,
        )
    @staticmethod
    def _extract_thumbnail(html: str) -> str | None:
        """Extract first image URL from HTML content."""
        if not html:
            return None
        import re
        match = re.search(r'<img[^>]+src=["\']([^"\']+)["\']', html[:3000], re.IGNORECASE)
        if match:
            url = match.group(1).replace("&amp;", "&")
            # Skip tiny tracking pixels and icons
            if any(skip in url.lower() for skip in ["1x1", "pixel", "tracking", "spacer"]):
                return None
            return url
        return None
 class EntryListOut(BaseModel):
    total: int
--- a/services/reader/app/models.py
+++ b/services/reader/app/models.py
@@ -68,6 +68,7 @@ class Entry(Base):
    published_at = Column(DateTime)
    status = Column(String(10), default="unread")
    starred = Column(Boolean, default=False)
    thumbnail = Column(Text)
    reading_time = Column(Integer, default=1)
    created_at = Column(DateTime, default=datetime.utcnow)
--- a/services/reader/app/worker/tasks.py
+++ b/services/reader/app/worker/tasks.py
@@ -79,6 +79,31 @@ def _get_entry_content(entry: dict) -> str:
    return ""
 def _extract_thumbnail(entry: dict, content: str) -> str | None:
    """Extract thumbnail from feedparser entry or content HTML."""
    # 1. Check media:thumbnail
    for mt in entry.get("media_thumbnail", []):
        if mt.get("url"):
            return mt["url"]
    # 2. Check media:content with image type
    for mc in entry.get("media_content", []):
        if mc.get("medium") == "image" or (mc.get("type", "").startswith("image")):
            if mc.get("url"):
                return mc["url"]
    # 3. Check enclosures
    for enc in entry.get("enclosures", []):
        if enc.get("type", "").startswith("image") and enc.get("href"):
            return enc["href"]
    # 4. Extract from content HTML
    if content:
        match = re.search(r'<img[^>]+src=["\']([^"\']+)["\']', content[:3000], re.IGNORECASE)
        if match:
            url = match.group(1).replace("&amp;", "&")
            if not any(skip in url.lower() for skip in ["1x1", "pixel", "tracking", "spacer"]):
                return url
    return None
 def _get_entry_author(entry: dict) -> str | None:
    """Extract author from a feedparser entry."""
    if entry.get("author"):
@@ -157,6 +182,7 @@ def fetch_single_feed(feed_id: int):
            content = _get_entry_content(fe)
            pub_date = _parse_date(fe)
            thumb = _extract_thumbnail(fe, content)
            stmt = pg_insert(Entry).values(
                feed_id=feed.id,
@@ -168,6 +194,7 @@ def fetch_single_feed(feed_id: int):
                published_at=pub_date,
                status="unread",
                starred=False,
                thumbnail=thumb,
                reading_time=_calc_reading_time(content),
            ).on_conflict_do_nothing(
                constraint="uq_reader_entries_feed_url"