一、多模态AI概述
多模态AI(Multimodal AI)指能够同时理解和生成多种类型数据(文本、图像、音频、视频)的AI模型。相比纯文本模型,多模态模型能完成图像理解、视觉问答、语音交互等任务,被认为是AI从“能说”到“能看能听”的关键一步。
二、主流多模态模型对比
| 模型 | 图像理解 | 语音处理 | 视频理解 | API费用 | 开源 |
|------|----------|----------|----------|---------|------|
| GPT-4o | 高 | 中 | 低 | 中 | 否 |
| Claude 3.5 Sonnet | 高 | 低 | 低 | 中 | 否 |
| Gemini 1.5 Pro | 高 | 高 | 高 | 低 | 否 |
| Qwen-VL | 高 | 低 | 低 | 低 | 是 |
| InternVL | 高 | 低 | 低 | 低 | 是 |
| LLaVA | 中 | 低 | 低 | 免费 | 是 |
三、GPT-4o 图像理解实战
3.1 核心代码
/**
 * Sends an image together with a text question to the OpenAI
 * chat-completions endpoint and returns the text of the first choice.
 */
@Service
@Slf4j
public class MultimodalService {

    private static final String VISION_URL = "https://api.openai.com/v1/chat/completions";

    @Autowired
    private RestTemplate restTemplate;

    @Value("${openai.api.key}")
    private String apiKey;

    /**
     * Asks the "gpt-4o" model a question about one image.
     *
     * @param imageUrl value placed in the {@code image_url.url} field of the request
     *                 (callers in this file pass either a URL or a base64 data URI)
     * @param question text sent alongside the image
     * @return the {@code message.content} string of the first returned choice
     * @throws IllegalStateException if the response has no body, no choices,
     *                               or no textual content
     */
    public String analyzeImage(String imageUrl, String question) {
        // The value type must be Object: "content" is a list of typed parts,
        // not a plain string, so Map<String, String> would not compile.
        Map<String, Object> userMessage = new HashMap<>();
        userMessage.put("role", "user");
        userMessage.put("content", List.of(
                Map.of("type", "text", "text", question),
                Map.of("type", "image_url", "image_url",
                        Map.of("url", imageUrl, "detail", "high"))));

        Map<String, Object> body = new HashMap<>();
        body.put("model", "gpt-4o");
        body.put("max_tokens", 1000);
        body.put("messages", List.of(userMessage));

        HttpHeaders headers = new HttpHeaders();
        headers.setContentType(MediaType.APPLICATION_JSON);
        headers.setBearerAuth(apiKey);

        HttpEntity<Map<String, Object>> entity = new HttpEntity<>(body, headers);
        ResponseEntity<Map> resp = restTemplate.exchange(
                VISION_URL, HttpMethod.POST, entity, Map.class);
        return extractAnswer(resp.getBody());
    }

    /** Walks choices[0].message.content, failing with a clear message instead of an NPE or bad cast. */
    private static String extractAnswer(Map<?, ?> result) {
        if (result == null) {
            throw new IllegalStateException("OpenAI response has no body");
        }
        Object choices = result.get("choices");
        if (!(choices instanceof List) || ((List<?>) choices).isEmpty()) {
            throw new IllegalStateException("OpenAI response contains no choices");
        }
        Object firstChoice = ((List<?>) choices).get(0);
        Object message = firstChoice instanceof Map ? ((Map<?, ?>) firstChoice).get("message") : null;
        Object content = message instanceof Map ? ((Map<?, ?>) message).get("content") : null;
        if (!(content instanceof String)) {
            throw new IllegalStateException("OpenAI response contains no text content");
        }
        return (String) content;
    }
}
3.2 Controller接口
/**
 * REST endpoints under {@code /api/multimodal} that forward an image
 * (by URL or by upload) and a question to {@link MultimodalService}.
 */
@RestController
@RequestMapping("/api/multimodal")
public class MultimodalController {

    @Autowired
    private MultimodalService multimodalService;

    /**
     * Analyzes an image referenced by URL.
     *
     * @param imageUrl image location passed through to the service
     * @param question question about the image; a default prompt is used when omitted
     * @return a map with the single key {@code "result"}
     */
    @PostMapping("/analyze-image")
    public Map<String, String> analyzeImage(
            @RequestParam String imageUrl,
            @RequestParam(defaultValue = "描述这张图片") String question) {
        String result = multimodalService.analyzeImage(imageUrl, question);
        // HashMap instead of Map.of: Map.of throws NullPointerException on a null value.
        Map<String, String> response = new HashMap<>();
        response.put("result", result);
        return response;
    }

    /**
     * Analyzes an image uploaded by the front end. The file is converted to a
     * base64 data URI before being handed to the service.
     *
     * @param file     uploaded image
     * @param question question about the image
     * @return a map with keys {@code "result"} and {@code "filename"}
     * @throws IOException              if the upload's bytes cannot be read
     * @throws IllegalArgumentException if the upload is empty or is not declared as an image
     */
    @PostMapping("/upload-analyze")
    public Map<String, Object> uploadAndAnalyze(
            @RequestParam("file") MultipartFile file,
            @RequestParam String question) throws IOException {
        if (file.isEmpty()) {
            throw new IllegalArgumentException("Uploaded file is empty");
        }
        String mimeType = file.getContentType();
        // Without this check a missing content type would yield "data:null;base64,...".
        if (mimeType == null || !mimeType.startsWith("image/")) {
            throw new IllegalArgumentException("Uploaded file is not an image: " + mimeType);
        }

        // Encode the image as a base64 data URI.
        String base64 = Base64.getEncoder().encodeToString(file.getBytes());
        String dataUri = "data:" + mimeType + ";base64," + base64;
        String result = multimodalService.analyzeImage(dataUri, question);

        // The original filename may be null, which Map.of would reject.
        Map<String, Object> response = new HashMap<>();
        response.put("result", result);
        response.put("filename", file.getOriginalFilename());
        return response;
    }
}
四、Claude 3.5 Vision 图像问答
/**
 * Asks a Claude model a question about an image through the Anthropic
 * Messages endpoint.
 */
@Service
public class ClaudeVisionService {

    private static final String CLAUDE_URL = "https://api.anthropic.com/v1/messages";

    @Value("${anthropic.api.key}")
    private String apiKey;

    // One shared client instead of constructing a new RestTemplate on every call.
    private final RestTemplate restTemplate = new RestTemplate();

    /**
     * Sends one image (by URL) and one question in a single user message.
     *
     * @param imageUrl URL of the image, sent as a {@code url}-type image source
     * @param question text question about the image
     * @return the text of the first text block in the response
     * @throws IllegalStateException if the response carries no text block
     */
    public String askAboutImage(String imageUrl, String question) {
        // A "url" source carries the address in a "url" field; "media_type"/"data"
        // belong to the "base64" source type and must not be mixed in here.
        Map<String, Object> imageBlock = Map.of(
                "type", "image",
                "source", Map.of("type", "url", "url", imageUrl));
        Map<String, Object> textBlock = Map.of("type", "text", "text", question);

        Map<String, Object> body = new HashMap<>();
        body.put("model", "claude-3-5-sonnet-20241022");
        body.put("max_tokens", 1024);
        body.put("messages", List.of(
                Map.of("role", "user", "content", List.of(imageBlock, textBlock))));

        HttpHeaders headers = new HttpHeaders();
        headers.setContentType(MediaType.APPLICATION_JSON);
        headers.set("x-api-key", apiKey);
        headers.set("anthropic-version", "2023-06-01");

        HttpEntity<Map<String, Object>> entity = new HttpEntity<>(body, headers);
        ResponseEntity<Map> resp = restTemplate.exchange(
                CLAUDE_URL, HttpMethod.POST, entity, Map.class);
        return firstText(resp.getBody());
    }

    /** Returns the first string "text" value found in the response's "content" list. */
    private static String firstText(Map<?, ?> result) {
        Object content = result == null ? null : result.get("content");
        if (content instanceof List) {
            for (Object block : (List<?>) content) {
                if (block instanceof Map) {
                    Object text = ((Map<?, ?>) block).get("text");
                    if (text instanceof String) {
                        return (String) text;
                    }
                }
            }
        }
        throw new IllegalStateException("Claude response contains no text block");
    }
}
五、语音交互(Whisper + TTS)
5.1 语音转文字
/**
 * Speech-to-text: uploads an audio file to the OpenAI transcription
 * endpoint and returns the recognized text.
 */
@Service
public class WhisperService {

    private static final String TRANSCRIBE_URL = "https://api.openai.com/v1/audio/transcriptions";

    @Value("${openai.api.key}")
    private String apiKey;

    // One shared client instead of constructing a new RestTemplate on every call.
    private final RestTemplate restTemplate = new RestTemplate();

    /**
     * Transcribes the given audio with the "whisper-1" model, language "zh".
     *
     * @param audioFile uploaded audio to transcribe
     * @return the {@code text} field of the response
     * @throws IOException           kept in the signature for existing callers
     * @throws IllegalStateException if the response has no body or no text
     */
    public String transcribe(MultipartFile audioFile) throws IOException {
        HttpHeaders headers = new HttpHeaders();
        headers.setContentType(MediaType.MULTIPART_FORM_DATA);
        headers.setBearerAuth(apiKey);

        MultiValueMap<String, Object> body = new LinkedMultiValueMap<>();
        // Pass the upload's Resource directly: Spring's HttpResource is an interface
        // and cannot be instantiated with "new".
        body.add("file", audioFile.getResource());
        body.add("model", "whisper-1");
        body.add("language", "zh");

        HttpEntity<MultiValueMap<String, Object>> entity = new HttpEntity<>(body, headers);
        ResponseEntity<Map> resp = restTemplate.exchange(
                TRANSCRIBE_URL, HttpMethod.POST, entity, Map.class);

        Map<?, ?> result = resp.getBody();
        Object text = result == null ? null : result.get("text");
        if (!(text instanceof String)) {
            throw new IllegalStateException("Transcription response contains no text");
        }
        return (String) text;
    }
}
5.2 文字转语音
/**
 * Text-to-speech: posts text to the OpenAI speech endpoint and returns
 * the raw response bytes.
 */
@Service
public class TTSService {

    private static final String SPEECH_URL = "https://api.openai.com/v1/audio/speech";
    private static final String TTS_MODEL = "tts-1";
    private static final String TTS_VOICE = "alloy";

    @Value("${openai.api.key}")
    private String apiKey;

    /**
     * Requests synthesized speech for the given text.
     *
     * @param text the text to be spoken
     * @return the response body as a byte array
     */
    public byte[] textToSpeech(String text) {
        Map<String, Object> payload = Map.of(
                "model", TTS_MODEL,
                "input", text,
                "voice", TTS_VOICE);

        HttpHeaders requestHeaders = new HttpHeaders();
        requestHeaders.setContentType(MediaType.APPLICATION_JSON);
        requestHeaders.setBearerAuth(apiKey);

        HttpEntity<Map<String, Object>> request = new HttpEntity<>(payload, requestHeaders);
        RestTemplate client = new RestTemplate();
        ResponseEntity<byte[]> response =
                client.exchange(SPEECH_URL, HttpMethod.POST, request, byte[].class);
        return response.getBody();
    }
}
六、实战:智能客服多模态系统
/**
 * Customer-service pipeline: transcribes the user's speech, optionally
 * analyzes an uploaded image, asks the chat model for a reply, and
 * synthesizes that reply as speech.
 */
@Service
@Slf4j
public class SmartCustomerService {

    private static final String CHAT_URL = "https://api.openai.com/v1/chat/completions";

    @Autowired private MultimodalService multimodalService;
    @Autowired private WhisperService whisperService;
    @Autowired private TTSService ttsService;

    @Value("${openai.api.key}")
    private String apiKey;

    private final RestTemplate restTemplate = new RestTemplate();

    /**
     * Handles one user turn.
     *
     * @param audio the user's spoken input; required
     * @param image optional image; skipped when null or empty
     * @return the text reply produced by the chat model
     * @throws IOException              if the uploaded bytes cannot be read
     * @throws IllegalArgumentException if no audio was supplied
     * @throws IllegalStateException    if the chat response carries no text
     */
    public String handleUserInput(MultipartFile audio, MultipartFile image) throws IOException {
        if (audio == null || audio.isEmpty()) {
            throw new IllegalArgumentException("Audio input is required");
        }

        // 1. Speech to text.
        String userText = whisperService.transcribe(audio);
        log.info("用户说: {}", userText);

        // 2. Image analysis, only when an image was uploaded.
        String imageAnalysis = "";
        if (image != null && !image.isEmpty()) {
            String base64 = Base64.getEncoder().encodeToString(image.getBytes());
            String dataUri = "data:" + image.getContentType() + ";base64," + base64;
            imageAnalysis = multimodalService.analyzeImage(
                    dataUri, "分析这张图片中的产品或问题");
            log.info("图片分析: {}", imageAnalysis);
        }

        // 3. Build the combined context for the model.
        String prompt = String.format(
                "用户语音输入: %s\n图片分析结果: %s\n请给出专业客服回复:",
                userText, imageAnalysis);

        // 4. Get the reply with a text-only request. The previous approach pushed a
        //    fake "data:image/plain" payload through the vision call, which is not
        //    a valid image.
        String reply = completeText(prompt);

        // 5. Reply to speech. The audio is not yet stored or returned; only the
        //    text reply goes back to the caller.
        byte[] audioReply = ttsService.textToSpeech(reply);
        log.debug("Synthesized reply audio: {} bytes", audioReply == null ? 0 : audioReply.length);

        return reply;
    }

    /** Sends a single text-only user message to "gpt-4o" and returns choices[0].message.content. */
    private String completeText(String prompt) {
        Map<String, Object> body = new HashMap<>();
        body.put("model", "gpt-4o");
        body.put("max_tokens", 1000);
        body.put("messages", List.of(Map.of("role", "user", "content", prompt)));

        HttpHeaders headers = new HttpHeaders();
        headers.setContentType(MediaType.APPLICATION_JSON);
        headers.setBearerAuth(apiKey);

        ResponseEntity<Map> resp = restTemplate.exchange(
                CHAT_URL, HttpMethod.POST, new HttpEntity<>(body, headers), Map.class);

        Map<?, ?> result = resp.getBody();
        Object choices = result == null ? null : result.get("choices");
        if (!(choices instanceof List) || ((List<?>) choices).isEmpty()) {
            throw new IllegalStateException("Chat response contains no choices");
        }
        Object firstChoice = ((List<?>) choices).get(0);
        Object message = firstChoice instanceof Map ? ((Map<?, ?>) firstChoice).get("message") : null;
        Object content = message instanceof Map ? ((Map<?, ?>) message).get("content") : null;
        if (!(content instanceof String)) {
            throw new IllegalStateException("Chat response contains no text content");
        }
        return (String) content;
    }
}
七、最佳实践
- 图片压缩:发送给视觉API前适当压缩,减少Token消耗
- 流式输出:长文本用SSE流式返回,用户体验更好
- 多模态融合:语音+图像组合查询效果最佳
- 缓存策略:相同图片的分析结果做Redis缓存,节省API费用
- 错误兜底:API超时/限流时降级为纯文本模式
八、总结
多模态AI让应用从“能说”进化到“能看能听”。Spring Boot结合GPT-4o/Claude Vision/Whisper,可快速构建智能客服、内容审核、医疗影像分析等多种实用系统。核心接口设计:图片→Base64/URL→多模态API→解析响应→返回结果。
文章摘自:https://www.cnblogs.com/czlws/p/19853526/multimodal-ai-vision-speech-spring-boot
