From a3eabf3e3b80b34aa084ad525eab52c1b6946dec Mon Sep 17 00:00:00 2001
From: Yusuf Suleman <yusuf@quadjourney.com>
Date: Fri, 3 Apr 2026 19:32:47 -0500
Subject: [PATCH] =?UTF-8?q?feat:=20thumbnail=20extraction=20for=20Reader?=
 =?UTF-8?q?=20=E2=80=94=20fixes=20all=20clients?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Server-side (dashboard + iOS + any client):
- Added thumbnail column to reader_entries
- Worker extracts from media:thumbnail, media:content, image enclosures, then the first HTML img (in that priority order)
- API returns thumbnail in EntryOut: the stored value, else the first HTML img src in the content with &amp; decoded to &
- Backfilled 260 existing entries

iOS:
- Prefers API thumbnail, falls back to client-side extraction
- Decodes HTML entities (&amp;, &lt;, &gt;) in client-extracted image URLs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../Features/Reader/Models/ReaderModels.swift | 11 +++++++-
 services/reader/app/api/entries.py            | 21 +++++++++++++++
 services/reader/app/models.py                 |  1 +
 services/reader/app/worker/tasks.py           | 27 +++++++++++++++++++
 4 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/ios/Platform/Platform/Features/Reader/Models/ReaderModels.swift b/ios/Platform/Platform/Features/Reader/Models/ReaderModels.swift
index 699d877..167d5f2 100644
--- a/ios/Platform/Platform/Features/Reader/Models/ReaderModels.swift
+++ b/ios/Platform/Platform/Features/Reader/Models/ReaderModels.swift
@@ -18,6 +18,7 @@ struct ReaderEntry: Codable, Identifiable, Hashable {
     let status: String
     let starred: Bool
     let readingTime: Int
+    let thumbnail: String?
     let feed: ReaderFeedRef?
 
     var isRead: Bool { status == "read" }
@@ -49,7 +50,12 @@ struct ReaderEntry: Codable, Identifiable, Hashable {
     }
 
     var thumbnailURL: URL? {
-        ReaderEntry.extractThumbnail(from: content ?? fullContent ?? "")
+        // Prefer server-provided thumbnail
+        if let thumb = thumbnail, !thumb.isEmpty, let url = URL(string: thumb) {
+            return url
+        }
+        // Fallback: extract from content
+        return ReaderEntry.extractThumbnail(from: content ?? fullContent ?? "")
     }
 
     private static let imgRegex = try! NSRegularExpression(
@@ -68,6 +74,9 @@ struct ReaderEntry: Codable, Identifiable, Hashable {
             return nil
         }
         let src = String(searchRange[srcRange])
+            .replacingOccurrences(of: "&amp;", with: "&")
+            .replacingOccurrences(of: "&lt;", with: "<")
+            .replacingOccurrences(of: "&gt;", with: ">")
         return URL(string: src)
     }
 
diff --git a/services/reader/app/api/entries.py b/services/reader/app/api/entries.py
index b408869..be3ea9e 100644
--- a/services/reader/app/api/entries.py
+++ b/services/reader/app/api/entries.py
@@ -40,6 +40,7 @@ class EntryOut(BaseModel):
     status: str = "unread"
     starred: bool = False
     reading_time: int = 1
+    thumbnail: str | None = None
     feed: FeedRef | None = None
 
     class Config:
@@ -49,6 +50,10 @@ class EntryOut(BaseModel):
     def from_entry(cls, entry: Entry) -> "EntryOut":
         # Use full_content if available, otherwise RSS content
         best_content = entry.full_content if entry.full_content else entry.content
+        # Extract thumbnail from stored field, or from content
+        thumb = entry.thumbnail
+        if not thumb:
+            thumb = cls._extract_thumbnail(entry.content or entry.full_content or "")
         return cls(
             id=entry.id,
             title=entry.title,
@@ -60,9 +65,25 @@ class EntryOut(BaseModel):
             status=entry.status,
             starred=entry.starred,
             reading_time=entry.reading_time,
+            thumbnail=thumb,
             feed=FeedRef(id=entry.feed.id, title=entry.feed.title) if entry.feed else None,
         )
 
+    @staticmethod
+    def _extract_thumbnail(html: str) -> str | None:
+        """Extract first image URL from HTML content."""
+        if not html:
+            return None
+        import re
+        match = re.search(r'<img[^>]+src=["\']([^"\']+)["\']', html[:3000], re.IGNORECASE)
+        if match:
+            url = match.group(1).replace("&amp;", "&")
+            # Skip tiny tracking pixels and icons
+            if any(skip in url.lower() for skip in ["1x1", "pixel", "tracking", "spacer"]):
+                return None
+            return url
+        return None
+
 
 class EntryListOut(BaseModel):
     total: int
diff --git a/services/reader/app/models.py b/services/reader/app/models.py
index 2aa6f35..9b9258e 100644
--- a/services/reader/app/models.py
+++ b/services/reader/app/models.py
@@ -68,6 +68,7 @@ class Entry(Base):
     published_at = Column(DateTime)
     status = Column(String(10), default="unread")
     starred = Column(Boolean, default=False)
+    thumbnail = Column(Text)
     reading_time = Column(Integer, default=1)
     created_at = Column(DateTime, default=datetime.utcnow)
 
diff --git a/services/reader/app/worker/tasks.py b/services/reader/app/worker/tasks.py
index ea92652..cb3af37 100644
--- a/services/reader/app/worker/tasks.py
+++ b/services/reader/app/worker/tasks.py
@@ -79,6 +79,31 @@ def _get_entry_content(entry: dict) -> str:
     return ""
 
 
+def _extract_thumbnail(entry: dict, content: str) -> str | None:
+    """Extract thumbnail from feedparser entry or content HTML."""
+    # 1. Check media:thumbnail
+    for mt in entry.get("media_thumbnail", []):
+        if mt.get("url"):
+            return mt["url"]
+    # 2. Check media:content with image type
+    for mc in entry.get("media_content", []):
+        if mc.get("medium") == "image" or (mc.get("type", "").startswith("image")):
+            if mc.get("url"):
+                return mc["url"]
+    # 3. Check enclosures
+    for enc in entry.get("enclosures", []):
+        if enc.get("type", "").startswith("image") and enc.get("href"):
+            return enc["href"]
+    # 4. Extract from content HTML
+    if content:
+        match = re.search(r'<img[^>]+src=["\']([^"\']+)["\']', content[:3000], re.IGNORECASE)
+        if match:
+            url = match.group(1).replace("&amp;", "&")
+            if not any(skip in url.lower() for skip in ["1x1", "pixel", "tracking", "spacer"]):
+                return url
+    return None
+
+
 def _get_entry_author(entry: dict) -> str | None:
     """Extract author from a feedparser entry."""
     if entry.get("author"):
@@ -157,6 +182,7 @@ def fetch_single_feed(feed_id: int):
 
             content = _get_entry_content(fe)
             pub_date = _parse_date(fe)
+            thumb = _extract_thumbnail(fe, content)
 
             stmt = pg_insert(Entry).values(
                 feed_id=feed.id,
@@ -168,6 +194,7 @@ def fetch_single_feed(feed_id: int):
                 published_at=pub_date,
                 status="unread",
                 starred=False,
+                thumbnail=thumb,
                 reading_time=_calc_reading_time(content),
             ).on_conflict_do_nothing(
                 constraint="uq_reader_entries_feed_url"