Multimodal Interaction

Multimodal interaction means that when working with an LLM you are not limited to text: you can also send and receive images, audio, video, and other kinds of content as input and output.

Supported Modalities

Input Modalities

| Modality | Supported Models | Common Uses |
|---|---|---|
| Text | All models | Conversation, generation |
| Image | GPT-4o, Claude 3, Gemini | Image understanding, OCR |
| Audio | GPT-4o, Gemini | Speech recognition |
| Video | Gemini | Video understanding |
| File | Claude, Gemini | Document analysis |

Output Modalities

| Modality | Supported Models | Common Uses |
|---|---|---|
| Text | All models | Answers, generation |
| Image | DALL-E, FLUX, Midjourney | Image generation |
| Audio | OpenAI TTS | Speech synthesis |

Image Understanding

Basic Usage

from openai import OpenAI

client = OpenAI(api_key="your-api-key", base_url="https://api.weelinking.com/v1")

def analyze_image(image_url, question="Please describe this image"):
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url}
                    }
                ]
            }
        ]
    )
    return response.choices[0].message.content

# Analyze an image hosted on the web
result = analyze_image("https://example.com/image.jpg", "What is in this image?")

Base64 Images

import base64

def encode_image(image_path):
    with open(image_path, "rb") as f:
        return base64.standard_b64encode(f.read()).decode("utf-8")

def analyze_local_image(image_path, question):
    base64_image = encode_image(image_path)

    # Determine the MIME type from the file extension
    if image_path.endswith(".png"):
        mime_type = "image/png"
    elif image_path.endswith(".gif"):
        mime_type = "image/gif"
    else:
        mime_type = "image/jpeg"

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        }
                    }
                ]
            }
        ]
    )
    return response.choices[0].message.content

Multi-Image Analysis

def compare_images(image_urls, question):
    content = [{"type": "text", "text": question}]

    for url in image_urls:
        content.append({
            "type": "image_url",
            "image_url": {"url": url}
        })

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}]
    )
    return response.choices[0].message.content

# Compare two images
result = compare_images(
    ["https://example.com/before.jpg", "https://example.com/after.jpg"],
    "Please compare the differences between these two images"
)

Image Understanding Use Cases

1. OCR Text Recognition

def ocr_image(image_url):
    return analyze_image(
        image_url,
        "Please recognize all the text in the image and output it in its original layout"
    )

2. Document / Table Analysis

def analyze_document(image_url):
    return analyze_image(
        image_url,
        """Analyze this document:
1. Document type
2. Summary of the main content
3. If there are tables, extract the table data
4. Key figures and conclusions"""
    )

3. Chart Interpretation

def interpret_chart(image_url):
    return analyze_image(
        image_url,
        """Please interpret this chart:
1. Chart type
2. Meaning of the x and y axes
3. Data trends
4. Key insights"""
    )

4. Code Screenshot Analysis

def analyze_code_screenshot(image_url):
    return analyze_image(
        image_url,
        """Please analyze this code:
1. Identify the programming language
2. Explain what the code does
3. Point out potential problems
4. Suggest improvements"""
    )

5. Product Design Review

def review_ui_design(image_url):
    return analyze_image(
        image_url,
        """As a UI/UX expert, please review this design:
1. Overall visual impression
2. User experience analysis
3. Usability issues
4. Suggested improvements"""
    )

Image Generation

Using DALL-E

def generate_image(prompt, size="1024x1024", quality="standard"):
    response = client.images.generate(
        model="dall-e-3",
        prompt=prompt,
        size=size,
        quality=quality,
        n=1
    )
    return response.data[0].url

# Generate an image
image_url = generate_image(
    "A cute orange tabby cat napping in the sunshine, watercolor style",
    size="1024x1024",
    quality="hd"
)
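
The URL returned by the image endpoint is typically short-lived, so in practice you usually download the result right away. Below is a minimal sketch using the requests library; save_generated_image and the output file name are illustrative, not part of the API.

import requests

def save_generated_image(prompt, output_file="generated.png"):
    # Generate the image, then download the returned URL to a local file
    url = generate_image(prompt, quality="hd")
    image_bytes = requests.get(url, timeout=60).content
    with open(output_file, "wb") as f:
        f.write(image_bytes)
    return output_file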

Image Editing

def edit_image(original_image, mask_image, prompt):
    with open(original_image, "rb") as img, open(mask_image, "rb") as mask:
        response = client.images.edit(
            model="dall-e-2",
            image=img,
            mask=mask,
            prompt=prompt,
            size="1024x1024"
        )
    return response.data[0].url
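
For images.edit, the mask should be a PNG with the same dimensions as the original image; its fully transparent areas mark the regions the model is allowed to repaint. A hypothetical call (the file names are placeholders):

# The transparent area of mask.png marks the region to repaint
edited_url = edit_image("photo.png", "mask.png", "Replace the sky with a sunset")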

Audio Processing

Speech to Text (STT)

def transcribe_audio(audio_file):
    with open(audio_file, "rb") as f:
        response = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            language="zh"
        )
    return response.text

Text to Speech (TTS)

def text_to_speech(text, voice="alloy", output_file="output.mp3"):
    response = client.audio.speech.create(
        model="tts-1",
        voice=voice,  # alloy, echo, fable, onyx, nova, shimmer
        input=text
    )

    with open(output_file, "wb") as f:
        f.write(response.content)

    return output_file
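
The two helpers above can be chained into a simple voice round trip: transcribe a recording, ask the model for a reply, and read that reply back out. A rough sketch; the file names are placeholders.

# Transcribe a local recording, generate a short reply, and speak it
text = transcribe_audio("meeting.mp3")
reply = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": f"Summarize this in one sentence: {text}"}]
).choices[0].message.content
text_to_speech(reply, voice="nova", output_file="summary.mp3")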

Video Understanding

Gemini Video Analysis

def analyze_video(video_url, question):
    # Gemini accepts video URLs
    response = client.chat.completions.create(
        model="gemini-2.5-flash",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "video_url",
                        "video_url": {"url": video_url}
                    }
                ]
            }
        ]
    )
    return response.choices[0].message.content
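
A hypothetical call is shown below; note that the video_url content part is a Gemini-oriented extension rather than part of the standard OpenAI chat format, and the URL is a placeholder.

# Summarize a video from a placeholder URL
summary = analyze_video(
    "https://example.com/demo.mp4",
    "Summarize the main content of this video and list the key moments"
)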

Multimodal Conversation

Continuous Multimodal Interaction

class MultimodalChat:
    def __init__(self):
        self.messages = []

    def add_text(self, text, role="user"):
        self.messages.append({
            "role": role,
            "content": text
        })

    def add_image(self, image_url, text=""):
        content = []
        if text:
            content.append({"type": "text", "text": text})
        content.append({
            "type": "image_url",
            "image_url": {"url": image_url}
        })
        self.messages.append({
            "role": "user",
            "content": content
        })

    def chat(self, user_input):
        if isinstance(user_input, str):
            self.add_text(user_input)

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=self.messages
        )

        assistant_message = response.choices[0].message.content
        self.add_text(assistant_message, role="assistant")

        return assistant_message

# Usage
chat = MultimodalChat()
chat.add_image("https://example.com/chart.png", "This is a sales data chart")
response = chat.chat("Analyze the trends in this chart")
follow_up = chat.chat("Forecast next quarter's sales")

Best Practices

1. Image Quality

# Control the image detail level
{
    "type": "image_url",
    "image_url": {
        "url": image_url,
        "detail": "high"  # low, high, auto
    }
}

2. Image Compression

from PIL import Image
import base64
import io

def compress_image(image_path, max_size=1024):
    img = Image.open(image_path)

    # Resize so the longest side is at most max_size pixels
    ratio = min(max_size / img.width, max_size / img.height)
    if ratio < 1:
        new_size = (int(img.width * ratio), int(img.height * ratio))
        img = img.resize(new_size)

    # Compress to JPEG (convert first, since JPEG has no alpha channel)
    buffer = io.BytesIO()
    img.convert("RGB").save(buffer, format="JPEG", quality=85)

    return base64.b64encode(buffer.getvalue()).decode()

3. Cost Control

  • Use detail: low to cut token consumption (see the sketch after this list)
  • Compress images to reduce the payload size
  • Send only the regions of the image that are actually needed
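
A minimal sketch combining the first two points, reusing the compress_image helper from the previous snippet; analyze_image_cheaply is an illustrative name, and the MIME type assumes the JPEG output of compress_image.

def analyze_image_cheaply(image_path, question):
    # Shrink and re-encode the image first to reduce the payload
    base64_image = compress_image(image_path)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "low"  # low detail uses far fewer image tokens
                        }
                    }
                ]
            }
        ]
    )
    return response.choices[0].message.content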

4. Error Handling

def safe_analyze_image(image_url, question):
    try:
        return analyze_image(image_url, question)
    except Exception as e:
        if "invalid_image" in str(e):
            return "Unable to process this image format"
        elif "image_too_large" in str(e):
            return "The image is too large; please compress it and try again"
        raise

Notes

  1. Privacy and security: do not send images that contain sensitive information
  2. Copyright: be mindful of image copyrights
  3. Content moderation: some images may be refused
  4. Cost awareness: image analysis is more expensive than plain text