feat: thumbnail extraction for Reader — fixes all clients
All checks were successful
Security Checks / dockerfile-lint (push) Successful in 4s
Security Checks / dependency-audit (push) Successful in 13s
Security Checks / secret-scanning (push) Successful in 3s

Server-side (dashboard + iOS + any client):
- Added thumbnail column to reader_entries
- Worker extracts from media:thumbnail, media:content, enclosures, HTML img
- API returns thumbnail in EntryOut with `&amp;` decoding
- Backfilled 260 existing entries

iOS:
- Prefers API thumbnail, falls back to client-side extraction
- Decodes HTML entities in URLs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Yusuf Suleman
2026-04-03 19:32:47 -05:00
parent 798ba17a93
commit a3eabf3e3b
4 changed files with 59 additions and 1 deletions

View File

@@ -18,6 +18,7 @@ struct ReaderEntry: Codable, Identifiable, Hashable {
let status: String let status: String
let starred: Bool let starred: Bool
let readingTime: Int let readingTime: Int
let thumbnail: String?
let feed: ReaderFeedRef? let feed: ReaderFeedRef?
var isRead: Bool { status == "read" } var isRead: Bool { status == "read" }
@@ -49,7 +50,12 @@ struct ReaderEntry: Codable, Identifiable, Hashable {
} }
var thumbnailURL: URL? { var thumbnailURL: URL? {
ReaderEntry.extractThumbnail(from: content ?? fullContent ?? "") // Prefer server-provided thumbnail
if let thumb = thumbnail, !thumb.isEmpty, let url = URL(string: thumb) {
return url
}
// Fallback: extract from content
return ReaderEntry.extractThumbnail(from: content ?? fullContent ?? "")
} }
private static let imgRegex = try! NSRegularExpression( private static let imgRegex = try! NSRegularExpression(
@@ -68,6 +74,9 @@ struct ReaderEntry: Codable, Identifiable, Hashable {
return nil return nil
} }
let src = String(searchRange[srcRange]) let src = String(searchRange[srcRange])
.replacingOccurrences(of: "&amp;", with: "&")
.replacingOccurrences(of: "&lt;", with: "<")
.replacingOccurrences(of: "&gt;", with: ">")
return URL(string: src) return URL(string: src)
} }

View File

@@ -40,6 +40,7 @@ class EntryOut(BaseModel):
status: str = "unread" status: str = "unread"
starred: bool = False starred: bool = False
reading_time: int = 1 reading_time: int = 1
thumbnail: str | None = None
feed: FeedRef | None = None feed: FeedRef | None = None
class Config: class Config:
@@ -49,6 +50,10 @@ class EntryOut(BaseModel):
def from_entry(cls, entry: Entry) -> "EntryOut": def from_entry(cls, entry: Entry) -> "EntryOut":
# Use full_content if available, otherwise RSS content # Use full_content if available, otherwise RSS content
best_content = entry.full_content if entry.full_content else entry.content best_content = entry.full_content if entry.full_content else entry.content
# Extract thumbnail from stored field, or from content
thumb = entry.thumbnail
if not thumb:
thumb = cls._extract_thumbnail(entry.content or entry.full_content or "")
return cls( return cls(
id=entry.id, id=entry.id,
title=entry.title, title=entry.title,
@@ -60,9 +65,25 @@ class EntryOut(BaseModel):
status=entry.status, status=entry.status,
starred=entry.starred, starred=entry.starred,
reading_time=entry.reading_time, reading_time=entry.reading_time,
thumbnail=thumb,
feed=FeedRef(id=entry.feed.id, title=entry.feed.title) if entry.feed else None, feed=FeedRef(id=entry.feed.id, title=entry.feed.title) if entry.feed else None,
) )
@staticmethod
def _extract_thumbnail(html: str) -> str | None:
"""Extract first image URL from HTML content."""
if not html:
return None
import re
match = re.search(r'<img[^>]+src=["\']([^"\']+)["\']', html[:3000], re.IGNORECASE)
if match:
url = match.group(1).replace("&amp;", "&")
# Skip tiny tracking pixels and icons
if any(skip in url.lower() for skip in ["1x1", "pixel", "tracking", "spacer"]):
return None
return url
return None
class EntryListOut(BaseModel): class EntryListOut(BaseModel):
total: int total: int

View File

@@ -68,6 +68,7 @@ class Entry(Base):
published_at = Column(DateTime) published_at = Column(DateTime)
status = Column(String(10), default="unread") status = Column(String(10), default="unread")
starred = Column(Boolean, default=False) starred = Column(Boolean, default=False)
thumbnail = Column(Text)
reading_time = Column(Integer, default=1) reading_time = Column(Integer, default=1)
created_at = Column(DateTime, default=datetime.utcnow) created_at = Column(DateTime, default=datetime.utcnow)

View File

@@ -79,6 +79,31 @@ def _get_entry_content(entry: dict) -> str:
return "" return ""
def _extract_thumbnail(entry: dict, content: str) -> str | None:
"""Extract thumbnail from feedparser entry or content HTML."""
# 1. Check media:thumbnail
for mt in entry.get("media_thumbnail", []):
if mt.get("url"):
return mt["url"]
# 2. Check media:content with image type
for mc in entry.get("media_content", []):
if mc.get("medium") == "image" or (mc.get("type", "").startswith("image")):
if mc.get("url"):
return mc["url"]
# 3. Check enclosures
for enc in entry.get("enclosures", []):
if enc.get("type", "").startswith("image") and enc.get("href"):
return enc["href"]
# 4. Extract from content HTML
if content:
match = re.search(r'<img[^>]+src=["\']([^"\']+)["\']', content[:3000], re.IGNORECASE)
if match:
url = match.group(1).replace("&amp;", "&")
if not any(skip in url.lower() for skip in ["1x1", "pixel", "tracking", "spacer"]):
return url
return None
def _get_entry_author(entry: dict) -> str | None: def _get_entry_author(entry: dict) -> str | None:
"""Extract author from a feedparser entry.""" """Extract author from a feedparser entry."""
if entry.get("author"): if entry.get("author"):
@@ -157,6 +182,7 @@ def fetch_single_feed(feed_id: int):
content = _get_entry_content(fe) content = _get_entry_content(fe)
pub_date = _parse_date(fe) pub_date = _parse_date(fe)
thumb = _extract_thumbnail(fe, content)
stmt = pg_insert(Entry).values( stmt = pg_insert(Entry).values(
feed_id=feed.id, feed_id=feed.id,
@@ -168,6 +194,7 @@ def fetch_single_feed(feed_id: int):
published_at=pub_date, published_at=pub_date,
status="unread", status="unread",
starred=False, starred=False,
thumbnail=thumb,
reading_time=_calc_reading_time(content), reading_time=_calc_reading_time(content),
).on_conflict_do_nothing( ).on_conflict_do_nothing(
constraint="uq_reader_entries_feed_url" constraint="uq_reader_entries_feed_url"