feat: brain PDF/image text extraction — pymupdf + tesseract OCR + vision API
- PDF: extracts selectable text via pymupdf, falls back to Tesseract OCR for scanned docs - PDF: renders first page as screenshot thumbnail - Images: Tesseract OCR for text extraction, OpenAI vision API fallback for photos - Plain text files: direct decode - All extracted text stored in extracted_text field for search/embedding - Tested: PDF upload → text extracted → AI classified → searchable New deps: pymupdf, pytesseract, Pillow System dep: tesseract-ocr added to both Dockerfiles Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
202
services/brain/app/services/extract.py
Normal file
202
services/brain/app/services/extract.py
Normal file
@@ -0,0 +1,202 @@
|
||||
"""Document and image text extraction — PDF parsing, OCR, vision API."""
|
||||
|
||||
import io
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
from app.config import OPENAI_API_KEY, OPENAI_MODEL
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_pdf_text(pdf_bytes: bytes) -> dict:
    """Extract the selectable text layer from a PDF.

    Args:
        pdf_bytes: Raw PDF file contents.

    Returns:
        dict with keys:
            text: All pages' text joined by blank lines.
            page_count: Number of pages in the document.
            needs_ocr: True when almost no text was found (likely a scan).
            pages_text: Per-page extracted text, in page order.
    """
    import fitz  # pymupdf

    pages_text: list[str] = []
    # Context manager guarantees the document handle is released even if
    # text extraction raises mid-page (the original leaked it on error).
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pages_text.append(page.get_text("text").strip())

    combined = "\n\n".join(pages_text).strip()

    # If very little text extracted, it's probably a scanned PDF;
    # callers should fall back to OCR in that case.
    needs_ocr = len(combined) < 50 and len(pages_text) > 0

    return {
        "text": combined,
        "page_count": len(pages_text),
        "needs_ocr": needs_ocr,
        "pages_text": pages_text,
    }
|
||||
|
||||
|
||||
def render_pdf_first_page(pdf_bytes: bytes) -> bytes | None:
    """Render the first page of a PDF as a PNG image.

    Args:
        pdf_bytes: Raw PDF file contents.

    Returns:
        PNG-encoded bytes of the first page, or None if the document is
        empty or cannot be rendered.
    """
    import fitz

    try:
        # Context manager closes the document even when rendering raises
        # (the original leaked the handle on error).
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            if len(doc) == 0:
                return None
            # Render at 2x resolution for quality
            pix = doc[0].get_pixmap(matrix=fitz.Matrix(2, 2))
            return pix.tobytes("png")
    except Exception as e:
        # Thumbnail rendering is best-effort; degrade gracefully.
        log.error("Failed to render PDF page: %s", e)
        return None
|
||||
|
||||
|
||||
def render_pdf_pages(pdf_bytes: bytes, max_pages: int = 5) -> list[bytes]:
    """Render up to *max_pages* pages of a PDF as PNG images.

    Args:
        pdf_bytes: Raw PDF file contents.
        max_pages: Maximum number of pages to render, from the start.

    Returns:
        List of PNG-encoded page images; may be empty or partial if
        rendering fails part-way through.
    """
    import fitz

    images: list[bytes] = []
    try:
        # Context manager releases the document even if rendering raises
        # mid-loop (the original leaked the handle on error).
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            # Hoisted out of the loop: the 2x-resolution scale matrix
            # never changes between pages.
            mat = fitz.Matrix(2, 2)
            for index, page in enumerate(doc):
                if index >= max_pages:
                    break
                images.append(page.get_pixmap(matrix=mat).tobytes("png"))
    except Exception as e:
        log.error("Failed to render PDF pages: %s", e)
    return images
|
||||
|
||||
|
||||
def ocr_image(image_bytes: bytes) -> str:
    """Run Tesseract OCR on an image.

    Args:
        image_bytes: Raw image file contents (any format Pillow can open).

    Returns:
        Extracted text, stripped; empty string if decoding or OCR fails.
    """
    import pytesseract
    from PIL import Image

    try:
        # `with` closes the image (and its underlying buffer) promptly;
        # the original relied on garbage collection to release it.
        with Image.open(io.BytesIO(image_bytes)) as img:
            return pytesseract.image_to_string(img).strip()
    except Exception as e:
        # OCR is best-effort: one bad image must not abort the pipeline.
        log.error("OCR failed: %s", e)
        return ""
|
||||
|
||||
|
||||
def ocr_pdf(pdf_bytes: bytes) -> str:
    """OCR a scanned PDF by rendering pages to images then running Tesseract."""
    page_images = render_pdf_pages(pdf_bytes, max_pages=10)
    total = len(page_images)
    chunks: list[str] = []
    for page_no, image in enumerate(page_images, start=1):
        log.info(f"OCR page {page_no}/{total}")
        # Skip pages where OCR produced nothing.
        if page_text := ocr_image(image):
            chunks.append(page_text)
    return "\n\n".join(chunks)
|
||||
|
||||
|
||||
async def describe_image_with_vision(image_bytes: bytes, content_type: str = "image/png") -> str:
    """Use OpenAI Vision API to describe an image. Returns description text."""
    if not OPENAI_API_KEY:
        log.warning("No OPENAI_API_KEY, skipping vision description")
        return ""

    import base64
    encoded = base64.b64encode(image_bytes).decode("utf-8")

    # Build the multimodal message: an instruction part plus the image
    # embedded as a base64 data URL.
    text_part = {
        "type": "text",
        "text": (
            "Describe this image in detail. Extract ALL text visible in the "
            "image. If it's a document, receipt, form, or screenshot, "
            "transcribe the text content. If it's a photo, describe what "
            "you see."
        ),
    }
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:{content_type};base64,{encoded}",
            "detail": "high",
        },
    }
    payload = {
        "model": OPENAI_MODEL,
        "messages": [{"role": "user", "content": [text_part, image_part]}],
        "max_tokens": 2000,
    }

    try:
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(
                "https://api.openai.com/v1/chat/completions",
                headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
                json=payload,
            )
            resp.raise_for_status()
            data = resp.json()
            return data["choices"][0]["message"]["content"].strip()
    except Exception as e:
        log.error(f"Vision API failed: {e}")
        return ""
|
||||
|
||||
|
||||
def extract_text_from_file(file_bytes: bytes, content_type: str, filename: str) -> dict:
    """Extract text from any supported file type.

    Returns {text, method, page_count, screenshot_png}.
    """
    mime = content_type.lower()
    name = filename.lower()

    def _result(text: str, method: str, page_count=None, screenshot_png=None) -> dict:
        # Single place that shapes the return dict so every branch agrees.
        return {
            "text": text,
            "method": method,
            "page_count": page_count,
            "screenshot_png": screenshot_png,
        }

    # PDF: prefer the embedded text layer; fall back to OCR for scans.
    if mime == "application/pdf" or name.endswith(".pdf"):
        parsed = extract_pdf_text(file_bytes)
        if parsed["needs_ocr"]:
            log.info("PDF has no selectable text, running OCR...")
            text, method = ocr_pdf(file_bytes), "pdf_ocr"
        else:
            text, method = parsed["text"], "pdf_text"
        return _result(
            text,
            method,
            page_count=parsed["page_count"],
            screenshot_png=render_pdf_first_page(file_bytes),
        )

    # Images: OCR directly; the image itself serves as the "screenshot".
    if mime.startswith("image/"):
        return _result(ocr_image(file_bytes), "image_ocr")

    # Plain text files: direct decode.
    if mime.startswith("text/") or name.endswith((".txt", ".md", ".csv")):
        try:
            return _result(file_bytes.decode("utf-8"), "text_decode")
        except UnicodeDecodeError:
            return _result("", "decode_failed")

    # Anything else is unsupported.
    return _result("", "unsupported")
|
||||
@@ -99,6 +99,66 @@ async def _process_item(item_id: str):
|
||||
)
|
||||
db.add(asset)
|
||||
|
||||
# ── Step 1b: Process uploaded files (PDF, image, document) ──
|
||||
if item.type in ("pdf", "image", "document", "file"):
|
||||
from app.services.extract import extract_text_from_file, describe_image_with_vision
|
||||
from app.services.storage import storage as file_storage
|
||||
|
||||
# Find the original upload asset
|
||||
upload_asset = None
|
||||
for a in item.assets:
|
||||
if a.asset_type == "original_upload":
|
||||
upload_asset = a
|
||||
break
|
||||
|
||||
if upload_asset and file_storage.exists(upload_asset.storage_path):
|
||||
log.info(f"Extracting text from {item.type}: {upload_asset.filename}")
|
||||
file_bytes = file_storage.read(upload_asset.storage_path)
|
||||
result = extract_text_from_file(
|
||||
file_bytes,
|
||||
upload_asset.content_type or "application/octet-stream",
|
||||
upload_asset.filename,
|
||||
)
|
||||
|
||||
if result["text"]:
|
||||
extracted_text = result["text"]
|
||||
log.info(f"Extracted {len(extracted_text)} chars via {result['method']}")
|
||||
|
||||
# Save PDF screenshot as an asset
|
||||
if result.get("screenshot_png"):
|
||||
from app.services.storage import storage
|
||||
path = storage.save(
|
||||
item_id=item.id,
|
||||
asset_type="screenshot",
|
||||
filename="screenshot.png",
|
||||
data=result["screenshot_png"],
|
||||
)
|
||||
asset = ItemAsset(
|
||||
id=str(uuid.uuid4()),
|
||||
item_id=item.id,
|
||||
asset_type="screenshot",
|
||||
filename="screenshot.png",
|
||||
content_type="image/png",
|
||||
storage_path=path,
|
||||
)
|
||||
db.add(asset)
|
||||
|
||||
# For images with little OCR text, try vision API for description
|
||||
if item.type == "image" and len(extracted_text) < 50:
|
||||
log.info("Image has little OCR text, trying vision API...")
|
||||
vision_text = await describe_image_with_vision(
|
||||
file_bytes,
|
||||
upload_asset.content_type or "image/png",
|
||||
)
|
||||
if vision_text:
|
||||
extracted_text = vision_text
|
||||
log.info(f"Vision API returned {len(vision_text)} chars")
|
||||
|
||||
item.metadata_json = item.metadata_json or {}
|
||||
item.metadata_json["extraction_method"] = result["method"]
|
||||
if result.get("page_count"):
|
||||
item.metadata_json["page_count"] = result["page_count"]
|
||||
|
||||
# ── Step 2: AI classification ──
|
||||
log.info(f"Classifying item {item.id}")
|
||||
classification = await classify_item(
|
||||
|
||||
Reference in New Issue
Block a user