toMedia
Claude API

视觉与 PDF 处理

Claude 的多模态能力 — 图片输入、PDF 分析与 Files API

图片输入方式

Base64 编码

import anthropic
import base64
import requests

# Create the API client (constructed with no explicit arguments).
client = anthropic.Anthropic()

# Download the image. The timeout keeps a stalled server from hanging the
# script, and raise_for_status() prevents an HTTP error page from being
# base64-encoded and submitted as if it were JPEG data.
image_url = "https://example.com/photo.jpg"
response = requests.get(image_url, timeout=30)
response.raise_for_status()
image_data = base64.standard_b64encode(response.content).decode("utf-8")

# Send the encoded image first, followed by the text instruction.
message = client.messages.create(
    model="claude-opus-4-6",
    max_tokens=1024,
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    # Must match the actual format of the downloaded bytes.
                    "media_type": "image/jpeg",
                    "data": image_data,
                },
            },
            {"type": "text", "text": "详细描述这张图片。"},
        ],
    }],
)
# Print the text of the first content block in the reply.
print(message.content[0].text)

URL 引用

# Reference the image by URL instead of embedding its bytes in the request.
image_block = {
    "type": "image",
    "source": {"type": "url", "url": "https://example.com/photo.jpg"},
}
question_block = {"type": "text", "text": "这张图片里有什么?"}

# Single user turn: the image block first, then the question about it.
message = client.messages.create(
    model="claude-opus-4-6",
    max_tokens=1024,
    messages=[{"role": "user", "content": [image_block, question_block]}],
)

支持的图片格式

| 格式 | MIME 类型 |
| --- | --- |
| JPEG | image/jpeg |
| PNG | image/png |
| GIF | image/gif |
| WebP | image/webp |

限制:单张图片最大 5 MB,最大分辨率 8000×8000 像素。对于 1:1 宽高比的图片,不会被自动缩放的最大尺寸约为 1092×1092 px。

多图对比

# Build the content list: each image is preceded by a numbered text label so
# the final question can refer to the images by number.
image_urls = [
    "https://example.com/img1.jpg",
    "https://example.com/img2.jpg",
]
comparison_content = []
for index, url in enumerate(image_urls, start=1):
    comparison_content.append({"type": "text", "text": f"图片 {index}:"})
    comparison_content.append(
        {"type": "image", "source": {"type": "url", "url": url}}
    )
# The comparison question comes last, after all labelled images.
comparison_content.append({"type": "text", "text": "这两张图片有什么区别?"})

message = client.messages.create(
    model="claude-opus-4-6",
    max_tokens=1024,
    messages=[{"role": "user", "content": comparison_content}],
)

PDF 处理

从 URL 分析 PDF

# Point the document block at a remotely hosted PDF.
pdf_block = {
    "type": "document",
    "source": {"type": "url", "url": "https://example.com/report.pdf"},
}
findings_prompt = {"type": "text", "text": "这份文档的主要发现是什么?"}

# Single user turn: the document first, then the question about it.
message = client.messages.create(
    model="claude-opus-4-6",
    max_tokens=1024,
    messages=[{"role": "user", "content": [pdf_block, findings_prompt]}],
)

本地 PDF(Base64)

import base64

# Read the local PDF as raw bytes, then base64-encode it into a UTF-8 string.
with open("document.pdf", "rb") as pdf_file:
    raw_pdf = pdf_file.read()
pdf_data = base64.b64encode(raw_pdf).decode("utf-8")

# Embed the encoded PDF directly in the request as a base64 document block.
pdf_block = {
    "type": "document",
    "source": {
        "type": "base64",
        "media_type": "application/pdf",
        "data": pdf_data,
    },
}
summary_prompt = {"type": "text", "text": "总结这份文档"}

message = client.messages.create(
    model="claude-opus-4-6",
    max_tokens=1024,
    messages=[{"role": "user", "content": [pdf_block, summary_prompt]}],
)

Files API(重复使用)

上传一次,多次查询:

# Beta flag passed to both the upload call and every message call below.
FILES_BETA = "files-api-2025-04-14"

# Upload the PDF once; the returned object's id is reused for every query.
with open("document.pdf", "rb") as pdf_file:
    file_upload = client.beta.files.upload(
        file=("document.pdf", pdf_file, "application/pdf"),
        betas=[FILES_BETA],
    )

# The same uploaded file is referenced by id in each request.
uploaded_document = {
    "type": "document",
    "source": {"type": "file", "file_id": file_upload.id},
}

# Ask several different questions about the same uploaded file.
queries = ["总结", "提取关键数字", "列出行动项"]
for query in queries:
    message = client.beta.messages.create(
        model="claude-opus-4-6",
        max_tokens=1024,
        betas=[FILES_BETA],
        messages=[{
            "role": "user",
            "content": [uploaded_document, {"type": "text", "text": query}],
        }],
    )
    answer = message.content[0].text
    print(f"{query}: {answer}\n")

PDF + Prompt Caching

# Document block shared by both requests. The cache_control marker asks the
# API to cache the prompt prefix up to and including this block.
cached_document = {
    "type": "document",
    "source": {"type": "file", "file_id": file_upload.id},
    "cache_control": {"type": "ephemeral"},
}

# First request: creates the cache entry.
message = client.beta.messages.create(
    model="claude-opus-4-6",
    max_tokens=1024,
    betas=["files-api-2025-04-14"],
    messages=[{
        "role": "user",
        "content": [cached_document, {"type": "text", "text": "主题是什么?"}],
    }],
)
cache_written = message.usage.cache_creation_input_tokens
print(f"缓存创建 tokens: {cache_written}")

# Second request: reuses the cache (90% cheaper).
message2 = client.beta.messages.create(
    model="claude-opus-4-6",
    max_tokens=1024,
    betas=["files-api-2025-04-14"],
    messages=[{
        "role": "user",
        "content": [cached_document, {"type": "text", "text": "用 3 句话总结"}],
    }],
)
cache_read = message2.usage.cache_read_input_tokens
print(f"缓存读取 tokens: {cache_read}")

最佳实践

  1. 图片放前面:在 prompt 中将图片放在文字之前,效果更好
  2. 合理尺寸:最长边保持在 1568 像素以内
  3. Files API 复用:多次处理同一文件时使用 Files API
  4. Token 监控:大量图片时注意总 token 用量
  5. 格式兼容:捕获并处理图片格式不受支持或文件过大时产生的错误

On this page