# platform/services/brain/migrate_karakeep.py

"""Migrate all bookmarks from Karakeep into Brain service via API."""

import json
import os
import sys
import time
import urllib.request
import urllib.error
import tempfile

# Connection settings. Every value can be overridden through an environment
# variable of the same name; the defaults are unchanged from the original
# hard-coded values. Trailing slashes are stripped from the base URLs because
# the request helpers append paths that already begin with "/".
KARAKEEP_URL = os.environ.get("KARAKEEP_URL", "http://192.168.1.42:3005").rstrip("/")
# SECURITY NOTE(review): the fallback below looks like a real API key committed
# to source control. It is kept only so existing invocations keep working;
# rotate the key and pass KARAKEEP_API_KEY via the environment instead.
KARAKEEP_API_KEY = os.environ.get("KARAKEEP_API_KEY", "ak2_f4141e5fe7265e23bd6f_4549c932c262010eafd08acb2139f1ac")
BRAIN_URL = os.environ.get("BRAIN_URL", "http://localhost:8200").rstrip("/")
# Sent as the X-Gateway-User-Id header on every Brain request.
BRAIN_USER = os.environ.get("BRAIN_USER", "admin")


def karakeep_get(path):
    """GET ``path`` from the Karakeep API and return the decoded JSON body.

    Args:
        path: Request path (with leading slash and any query string); it is
            appended verbatim to ``KARAKEEP_URL``.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        urllib.error.URLError: On connection failure; its ``HTTPError``
            subclass is raised for HTTP error statuses.
        ValueError: If the body is not valid JSON.
    """
    req = urllib.request.Request(
        f"{KARAKEEP_URL}{path}",
        headers={"Authorization": f"Bearer {KARAKEEP_API_KEY}"},
    )
    # Use the response as a context manager so the socket is closed promptly
    # instead of lingering until garbage collection.
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())


def karakeep_download(asset_id):
    """Download one Karakeep asset.

    Args:
        asset_id: Identifier placed in the ``/api/v1/assets/<id>`` path.

    Returns:
        A ``(data, content_type)`` tuple: the raw response bytes and the
        response's ``Content-Type`` header, falling back to
        ``"application/octet-stream"`` when the header is absent.

    Raises:
        urllib.error.URLError: On connection failure; its ``HTTPError``
            subclass is raised for HTTP error statuses.
    """
    req = urllib.request.Request(
        f"{KARAKEEP_URL}/api/v1/assets/{asset_id}",
        headers={"Authorization": f"Bearer {KARAKEEP_API_KEY}"},
    )
    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(req, timeout=120) as resp:
        data = resp.read()
        content_type = resp.headers.get("Content-Type", "application/octet-stream")
    return data, content_type


def brain_post_json(path, data):
    """POST ``data`` as JSON to the Brain API and return the decoded reply.

    Args:
        path: Path below ``/api`` (with leading slash), e.g. ``"/items"``.
        data: JSON-serialisable payload.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        urllib.error.URLError: On connection failure; its ``HTTPError``
            subclass is raised for HTTP error statuses.
        ValueError: If the response body is not valid JSON.
    """
    body = json.dumps(data).encode()
    req = urllib.request.Request(
        f"{BRAIN_URL}/api{path}",
        data=body,
        headers={"X-Gateway-User-Id": BRAIN_USER, "Content-Type": "application/json"},
        method="POST",
    )
    # Context manager guarantees the response socket is released.
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())


def brain_upload(file_data, filename, content_type, title=None):
    """Upload a file to ``/api/items/upload`` as multipart/form-data.

    Args:
        file_data: Raw bytes of the file.
        filename: Name sent in the ``file`` part's Content-Disposition header.
        content_type: MIME type sent for the ``file`` part.
        title: Optional title; when truthy it is sent as a ``title`` field.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        urllib.error.URLError: On connection failure; its ``HTTPError``
            subclass is raised for HTTP error statuses.
        ValueError: If the response body is not valid JSON.
    """
    # A random boundary cannot realistically occur inside the payload, unlike
    # a fixed string which would truncate any file that happened to contain it.
    boundary = f"----MigrationBoundary{os.urandom(16).hex()}"

    # Filenames derive from bookmark titles, so they may contain quotes or
    # line breaks that would terminate the quoted header value early. Escape
    # them the way browsers do for multipart/form-data.
    safe_filename = (
        filename.replace('"', "%22").replace("\r", "%0D").replace("\n", "%0A")
    )

    parts = [
        # File part
        f"--{boundary}\r\n".encode(),
        f'Content-Disposition: form-data; name="file"; filename="{safe_filename}"\r\n'.encode(),
        f"Content-Type: {content_type}\r\n\r\n".encode(),
        file_data,
        b"\r\n",
    ]

    # Title part
    if title:
        parts.extend(
            [
                f"--{boundary}\r\n".encode(),
                b'Content-Disposition: form-data; name="title"\r\n\r\n',
                title.encode(),
                b"\r\n",
            ]
        )

    parts.append(f"--{boundary}--\r\n".encode())
    body = b"".join(parts)

    req = urllib.request.Request(
        f"{BRAIN_URL}/api/items/upload",
        data=body,
        headers={
            "X-Gateway-User-Id": BRAIN_USER,
            "Content-Type": f"multipart/form-data; boundary={boundary}",
        },
        method="POST",
    )
    # Context manager guarantees the response socket is released.
    with urllib.request.urlopen(req, timeout=60) as resp:
        return json.loads(resp.read())


def brain_get_item(item_id):
    """Fetch one item from the Brain API and return it as decoded JSON.

    Args:
        item_id: Identifier placed in the ``/api/items/<id>`` path.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        urllib.error.URLError: On connection failure; its ``HTTPError``
            subclass is raised for HTTP error statuses.
        ValueError: If the response body is not valid JSON.
    """
    req = urllib.request.Request(
        f"{BRAIN_URL}/api/items/{item_id}",
        headers={"X-Gateway-User-Id": BRAIN_USER},
    )
    # This is called repeatedly while polling, so leaking one socket per call
    # adds up; close each response explicitly.
    with urllib.request.urlopen(req, timeout=15) as resp:
        return json.loads(resp.read())


def fetch_all_bookmarks():
    """Return every Karakeep bookmark by following the paginated listing.

    Pages of up to 100 bookmarks are requested from ``/api/v1/bookmarks``;
    each response's ``nextCursor`` is passed back as the ``cursor`` query
    parameter until the server returns no cursor or an empty page.

    Returns:
        A list with the ``bookmarks`` entries of all pages, in server order.
    """
    # Local import keeps this function self-contained; the file's top-level
    # imports only name urllib.request and urllib.error.
    from urllib.parse import urlencode

    all_bk = []
    cursor = None
    while True:
        params = {"limit": 100}
        if cursor:
            # urlencode percent-escapes the cursor; raw concatenation would
            # corrupt the query if it contained "&", "+", "=" or similar.
            params["cursor"] = cursor
        data = karakeep_get(f"/api/v1/bookmarks?{urlencode(params)}")
        # Tolerate an explicit null as well as a missing key.
        bks = data.get("bookmarks") or []
        all_bk.extend(bks)
        cursor = data.get("nextCursor")
        if not cursor or not bks:
            break
    return all_bk


def wait_for_processing(item_id, timeout=120, poll_interval=3):
    """Poll a Brain item until it finishes processing or the timeout elapses.

    Args:
        item_id: Identifier of the item to poll.
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between polls.

    Returns:
        The item as soon as its ``processing_status`` is ``"ready"`` or
        ``"error"`` (a missing status is treated as ``"pending"``). If the
        timeout elapses first, the item is fetched once more and returned
        in whatever state it is in.
    """
    # Monotonic clock: the deadline must not move if the wall clock is
    # adjusted (NTP sync, DST) while we are waiting.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        item = brain_get_item(item_id)
        status = item.get("processing_status", "pending")
        if status in ("ready", "error"):
            return item
        time.sleep(poll_interval)
    return brain_get_item(item_id)


def _tags_agree(karakeep_tags, ai_tags):
    """Return True if both tag collections are empty or share at least one tag."""
    kk = set(karakeep_tags)
    ai = set(ai_tags)
    return bool(kk & ai) or not (kk or ai)


def _create_brain_item(bk_type, content, bk_title):
    """Create the Brain item corresponding to one Karakeep bookmark.

    Returns:
        A ``(response, skip_reason)`` pair. ``response`` is the decoded Brain
        reply and ``skip_reason`` is None when an item was created; otherwise
        ``response`` is None and ``skip_reason`` says why nothing was sent.
    """
    # "Untitled" is the placeholder main() substitutes for a missing title;
    # send no title in that case rather than the placeholder.
    title = bk_title if bk_title != "Untitled" else None

    if bk_type == "link":
        url = content.get("url", "")
        if not url:
            return None, "no URL"
        payload = {"type": "link", "url": url, "title": title}
        return brain_post_json("/items", payload), None

    if bk_type == "text":
        text = content.get("text", "")
        if not text:
            return None, "no text"
        payload = {"type": "note", "raw_content": text, "title": title}
        return brain_post_json("/items", payload), None

    if bk_type == "asset":
        asset_id = content.get("assetId")
        asset_type = content.get("assetType", "unknown")
        if not asset_id:
            return None, "no assetId"

        print(f"  Downloading {asset_type} ({asset_id[:8]})...")
        file_data, ct = karakeep_download(asset_id)
        ext = {"pdf": ".pdf", "image": ".png"}.get(asset_type, ".bin")
        filename = f"{bk_title[:50]}{ext}" if title is not None else f"upload{ext}"
        # Path separators are not valid inside a file name.
        filename = filename.replace("/", "-").replace("\\", "-")
        if asset_type == "pdf":
            # Force the PDF MIME type regardless of what Karakeep reported.
            ct = "application/pdf"
        return brain_upload(file_data, filename, ct, title=title), None

    return None, f"unknown type '{bk_type}'"


def main():
    """Copy every Karakeep bookmark into Brain and report how the AI filed it.

    For each bookmark an item is created in Brain, the script waits (up to
    90 seconds) for Brain to finish processing it, and the resulting title,
    folder and tags are printed next to the Karakeep originals. A summary is
    printed at the end and the per-item comparison is written to
    ``/tmp/migration_comparison.json``.

    A failure on one bookmark is reported and counted but never aborts the
    remaining bookmarks.
    """
    print("Fetching all Karakeep bookmarks...")
    bookmarks = fetch_all_bookmarks()
    print(f"Found {len(bookmarks)} bookmarks\n")

    # Sort: notes first, then links, then assets (PDFs take longer)
    type_order = {"text": 0, "link": 1, "asset": 2}

    def sort_key(b):
        # "or {}" tolerates an explicit null content as well as a missing key.
        return type_order.get((b.get("content") or {}).get("type", ""), 3)

    bookmarks.sort(key=sort_key)

    results = {"success": 0, "error": 0, "skipped": 0}
    comparison = []
    matches = 0
    diffs = 0

    for i, bk in enumerate(bookmarks, start=1):
        content = bk.get("content") or {}
        bk_type = content.get("type", "unknown")
        bk_title = bk.get("title") or "Untitled"

        print(f"[{i}/{len(bookmarks)}] {bk_type}: {bk_title[:60]}")

        try:
            # Parsed inside the try so one malformed bookmark is counted as an
            # error instead of aborting the whole run.
            bk_tags = [t["name"] for t in bk.get("tags") or []]
            bk_folder = (bk.get("list") or {}).get("name")

            resp, skip_reason = _create_brain_item(bk_type, content, bk_title)
            if skip_reason is not None:
                print(f"  SKIP: {skip_reason}")
                results["skipped"] += 1
                continue

            item_id = resp.get("id")
            if not item_id:
                # Without an id we would end up polling /items/None.
                raise RuntimeError(f"Brain response contains no item id: {resp!r}")
            print(f"  Created: {item_id} — waiting for AI classification...")

            # Wait for processing
            final = wait_for_processing(item_id, timeout=90)
            status = final.get("processing_status", "?")
            ai_folder = final.get("folder", "?")
            ai_tags = final.get("tags") or []
            ai_title = final.get("title", "?")

            # Evaluate the tag comparison before recording the entry, so an
            # entry whose tags cannot be compared never reaches the summary
            # or the saved file.
            tags_ok = _tags_agree(bk_tags, ai_tags)

            comparison.append({
                "karakeep_title": bk_title,
                "karakeep_tags": bk_tags,
                "karakeep_folder": bk_folder,
                "ai_title": ai_title,
                "ai_folder": ai_folder,
                "ai_tags": ai_tags,
                "status": status,
            })
            if tags_ok:
                matches += 1
            else:
                diffs += 1
            tag_match = "OK" if tags_ok else "DIFF"

            print(f"  Status: {status}")
            print(f"  AI Folder: {ai_folder} (Karakeep: {bk_folder or 'none'})")
            print(f"  AI Tags: {ai_tags} vs Karakeep: {bk_tags} [{tag_match}]")
            print(f"  AI Title: {ai_title}")

            # "Success" means the item was created and polled; the final
            # processing status is reported above but does not affect the count.
            results["success"] += 1

        except Exception as e:
            # Deliberate per-item boundary: report, count, and move on.
            print(f"  ERROR: {e}")
            results["error"] += 1

        print()

    # Summary
    print("=" * 60)
    print("MIGRATION COMPLETE")
    print(f"  Success: {results['success']}")
    print(f"  Errors:  {results['error']}")
    print(f"  Skipped: {results['skipped']}")
    print()

    # Tag comparison summary (items with no tags on either side count as a match)
    print(f"Tag overlap: {matches}/{len(comparison)} items had at least one matching tag")
    print(f"Tag differences: {diffs}/{len(comparison)} items had zero overlap")

    # Save comparison
    with open("/tmp/migration_comparison.json", "w", encoding="utf-8") as f:
        json.dump(comparison, f, indent=2)
    print("\nFull comparison saved to /tmp/migration_comparison.json")


# Run the migration only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()