zc
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -47,5 +47,4 @@ gemini-websocket-proxy/
|
|||||||
Tools/dark-server/dark-server.js
|
Tools/dark-server/dark-server.js
|
||||||
Tools/rimworld_cpt_data.jsonl
|
Tools/rimworld_cpt_data.jsonl
|
||||||
Tools/mem0-1.0.0/
|
Tools/mem0-1.0.0/
|
||||||
Tools/thenextagent-1
|
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
@@ -320,6 +320,12 @@ You are 'The Legion', a super AI of the Wula Empire. Your personality is authori
|
|||||||
_tools.Add(new Tool_CallBombardment());
|
_tools.Add(new Tool_CallBombardment());
|
||||||
_tools.Add(new Tool_SearchThingDef());
|
_tools.Add(new Tool_SearchThingDef());
|
||||||
_tools.Add(new Tool_SearchPawnKind());
|
_tools.Add(new Tool_SearchPawnKind());
|
||||||
|
|
||||||
|
// VLM 视觉分析工具 (条件性启用)
|
||||||
|
if (WulaFallenEmpireMod.settings?.enableVlmFeatures == true)
|
||||||
|
{
|
||||||
|
_tools.Add(new Tool_AnalyzeScreen());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void SetThinkingState(bool isThinking)
|
private void SetThinkingState(bool isThinking)
|
||||||
|
|||||||
@@ -0,0 +1,87 @@
|
|||||||
|
using System;
|
||||||
|
using UnityEngine;
|
||||||
|
|
||||||
|
namespace WulaFallenEmpire.EventSystem.AI
|
||||||
|
{
|
||||||
|
/// <summary>
/// Unity screen-capture helper used for VLM (vision-language model) analysis.
/// </summary>
public static class ScreenCaptureUtility
{
    // Longest edge, in pixels, of the image that is encoded; larger captures are
    // scaled down to keep the API payload (and cost) small.
    private const int MaxImageSize = 1024;

    /// <summary>
    /// Captures the current screen and returns it as a Base64-encoded PNG.
    /// </summary>
    /// <returns>
    /// The Base64 string, or null when the capture or the encoding fails
    /// (the failure is written to <c>WulaLog.Debug</c>).
    /// </returns>
    /// <remarks>
    /// NOTE(review): Unity documents that CaptureScreenshotAsTexture should run after
    /// the frame has finished rendering and on the main thread — confirm the callers do so.
    /// </remarks>
    public static string CaptureScreenAsBase64()
    {
        Texture2D screenshot = null;
        Texture2D resized = null;
        try
        {
            screenshot = ScreenCapture.CaptureScreenshotAsTexture();
            if (screenshot == null)
            {
                WulaLog.Debug("[ScreenCapture] CaptureScreenshotAsTexture returned null");
                return null;
            }

            // Returns the same instance when no scaling is required.
            resized = ResizeTexture(screenshot, MaxImageSize);

            byte[] pngBytes = resized.EncodeToPNG();
            if (pngBytes == null)
            {
                WulaLog.Debug("[ScreenCapture] EncodeToPNG returned null");
                return null;
            }

            WulaLog.Debug($"[ScreenCapture] Captured {pngBytes.Length} bytes");
            return Convert.ToBase64String(pngBytes);
        }
        catch (Exception ex)
        {
            WulaLog.Debug($"[ScreenCapture] Failed: {ex.Message}");
            return null;
        }
        finally
        {
            // Textures are native resources: release them on every path, including failures.
            if (resized != null && resized != screenshot)
            {
                UnityEngine.Object.Destroy(resized);
            }

            if (screenshot != null)
            {
                UnityEngine.Object.Destroy(screenshot);
            }
        }
    }

    /// <summary>
    /// Scales a texture so that its longest edge is at most <paramref name="maxSize"/>,
    /// preserving the aspect ratio.
    /// </summary>
    /// <param name="source">Texture to scale; it is never destroyed here.</param>
    /// <param name="maxSize">Maximum edge length in pixels.</param>
    /// <returns>
    /// <paramref name="source"/> itself when it already fits, otherwise a new RGB24 texture
    /// that the caller must destroy.
    /// </returns>
    private static Texture2D ResizeTexture(Texture2D source, int maxSize)
    {
        int width = source.width;
        int height = source.height;

        if (width <= maxSize && height <= maxSize)
        {
            return source; // already small enough
        }

        float ratio = (float)maxSize / Mathf.Max(width, height);
        // Clamp to 1 so an extreme aspect ratio cannot round a dimension down to 0.
        int newWidth = Mathf.Max(1, Mathf.RoundToInt(width * ratio));
        int newHeight = Mathf.Max(1, Mathf.RoundToInt(height * ratio));

        // Graphics.Blit changes RenderTexture.active, so the previous target has to be
        // remembered BEFORE the blit; otherwise the "restore" re-activates the temporary RT.
        RenderTexture previous = RenderTexture.active;
        RenderTexture rt = RenderTexture.GetTemporary(newWidth, newHeight);
        Texture2D resized = null;
        try
        {
            Graphics.Blit(source, rt);
            RenderTexture.active = rt;

            resized = new Texture2D(newWidth, newHeight, TextureFormat.RGB24, false);
            resized.ReadPixels(new Rect(0, 0, newWidth, newHeight), 0, 0);
            resized.Apply();

            WulaLog.Debug($"[ScreenCapture] Resized from {width}x{height} to {newWidth}x{newHeight}");

            Texture2D result = resized;
            resized = null; // ownership moves to the caller
            return result;
        }
        finally
        {
            RenderTexture.active = previous;
            RenderTexture.ReleaseTemporary(rt);

            // Only non-null when the read-back failed part-way through.
            if (resized != null)
            {
                UnityEngine.Object.Destroy(resized);
            }
        }
    }
}
|
||||||
|
}
|
||||||
@@ -346,5 +346,91 @@ namespace WulaFallenEmpire.EventSystem.AI
|
|||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Sends one non-streaming chat-completion request whose user message carries a text part
/// and a PNG image (as a base64 data URL), and returns the extracted reply.
/// </summary>
/// <param name="systemPrompt">Optional system message; left out of the request when null or empty.</param>
/// <param name="userText">Text part of the user message; null is sent as an empty string.</param>
/// <param name="base64Image">Base64-encoded PNG without the <c>data:</c> prefix.</param>
/// <param name="maxTokens">Token limit for the reply; values below 1 are raised to 1.</param>
/// <param name="temperature">Sampling temperature; clamped to the range 0..2.</param>
/// <returns>
/// Whatever <c>ExtractContent</c> yields for the response body, or null when the base URL or
/// the image is missing or the request does not finish successfully.
/// </returns>
public async Task<string> GetVisionCompletionAsync(
    string systemPrompt,
    string userText,
    string base64Image,
    int maxTokens = 512,
    float temperature = 0.3f)
{
    if (string.IsNullOrEmpty(_baseUrl))
    {
        WulaLog.Debug("[WulaAI] VLM: Base URL is missing.");
        return null;
    }

    if (string.IsNullOrEmpty(base64Image))
    {
        // Without this guard the request would carry an empty data URL.
        WulaLog.Debug("[WulaAI] VLM: Image data is missing.");
        return null;
    }

    // A trailing '/' would otherwise defeat the suffix checks and double the "/v1" segment.
    string baseUrl = _baseUrl.TrimEnd('/');
    string endpoint;
    if (baseUrl.EndsWith("/chat/completions", System.StringComparison.Ordinal))
    {
        endpoint = baseUrl;
    }
    else if (baseUrl.EndsWith("/v1", System.StringComparison.Ordinal))
    {
        endpoint = $"{baseUrl}/chat/completions";
    }
    else
    {
        endpoint = $"{baseUrl}/v1/chat/completions";
    }

    // Build the multimodal JSON body by hand; every free-text value goes through EscapeJson.
    StringBuilder jsonBuilder = new StringBuilder();
    jsonBuilder.Append("{");
    jsonBuilder.Append($"\"model\": \"{EscapeJson(_model ?? string.Empty)}\",");
    jsonBuilder.Append("\"stream\": false,");
    jsonBuilder.Append($"\"max_tokens\": {Math.Max(1, maxTokens)},");
    // Invariant culture keeps the decimal separator a '.' regardless of the OS locale.
    jsonBuilder.Append($"\"temperature\": {Mathf.Clamp(temperature, 0f, 2f).ToString("0.###", System.Globalization.CultureInfo.InvariantCulture)},");
    jsonBuilder.Append("\"messages\": [");

    if (!string.IsNullOrEmpty(systemPrompt))
    {
        jsonBuilder.Append($"{{\"role\": \"system\", \"content\": \"{EscapeJson(systemPrompt)}\"}},");
    }

    // User message: a content array with one text part and one image part.
    jsonBuilder.Append("{\"role\": \"user\", \"content\": [");
    jsonBuilder.Append($"{{\"type\": \"text\", \"text\": \"{EscapeJson(userText ?? string.Empty)}\"}},");
    jsonBuilder.Append("{\"type\": \"image_url\", \"image_url\": {");
    jsonBuilder.Append($"\"url\": \"data:image/png;base64,{base64Image}\"");
    jsonBuilder.Append("}}");
    jsonBuilder.Append("]}");

    jsonBuilder.Append("]}");

    string jsonBody = jsonBuilder.ToString();
    if (Prefs.DevMode)
    {
        // The image itself is deliberately not logged, only its length.
        WulaLog.Debug($"[WulaAI] VLM request to {endpoint} (model={_model}, imageSize={base64Image.Length} chars)");
    }

    using (UnityWebRequest request = new UnityWebRequest(endpoint, "POST"))
    {
        byte[] bodyRaw = Encoding.UTF8.GetBytes(jsonBody);
        request.uploadHandler = new UploadHandlerRaw(bodyRaw);
        request.downloadHandler = new DownloadHandlerBuffer();
        request.SetRequestHeader("Content-Type", "application/json");
        request.timeout = 60; // seconds; image requests tend to be slower than text-only ones

        if (!string.IsNullOrEmpty(_apiKey))
        {
            request.SetRequestHeader("Authorization", $"Bearer {_apiKey}");
        }

        var operation = request.SendWebRequest();

        // Poll until the request has finished.
        while (!operation.isDone)
        {
            await Task.Delay(100);
        }

        // Anything other than Success (including DataProcessingError) is a failure.
        if (request.result != UnityWebRequest.Result.Success)
        {
            WulaLog.Debug($"[WulaAI] VLM API Error: {request.error}");
            return null;
        }

        string responseText = request.downloadHandler.text;
        if (Prefs.DevMode)
        {
            WulaLog.Debug($"[WulaAI] VLM Response (truncated): {TruncateForLog(responseText)}");
        }

        return ExtractContent(responseText);
    }
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,99 @@
|
|||||||
|
using System;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
|
||||||
|
namespace WulaFallenEmpire.EventSystem.AI.Tools
|
||||||
|
{
|
||||||
|
/// <summary>
/// VLM screen-analysis tool: captures the game screen and asks a vision-language model
/// to describe it.
/// </summary>
public class Tool_AnalyzeScreen : AITool
{
    // Prompt used when the caller supplies no (or only a blank) <context> element.
    private const string DefaultContext = "描述当前屏幕内容";

    // Fallbacks used when the corresponding VLM setting is empty.
    private const string DefaultVlmBaseUrl = "https://dashscope.aliyuncs.com/compatible-mode/v1";
    private const string DefaultVlmModel = "qwen-vl-plus";

    public override string Name => "analyze_screen";

    public override string Description =>
        "分析当前游戏屏幕截图,了解玩家正在查看什么区域或内容。需要配置 VLM API 密钥。";

    public override string UsageSchema =>
        "<analyze_screen><context>分析目标,如:玩家在看什么区域</context></analyze_screen>";

    private const string VisionSystemPrompt = @"
你是一个 RimWorld 游戏屏幕分析助手。分析截图并用简洁中文描述:
- 玩家正在查看的区域(如:殖民地基地、世界地图、菜单界面)
- 可见的重要建筑、角色、资源
- 任何明显的问题或特殊状态
保持回答简洁,不超过100字。不要使用 XML 标签。";

    /// <summary>
    /// Runs the analysis and blocks until the VLM has answered.
    /// </summary>
    /// <param name="args">Tool arguments as XML; an optional <c>context</c> element sets the question.</param>
    /// <returns>The analysis text, or a user-facing error message; exceptions are not propagated.</returns>
    /// <remarks>
    /// NOTE(review): this waits synchronously on an async call whose continuations may be posted
    /// back to the calling thread's synchronization context. If that context is the thread being
    /// blocked here, the wait cannot complete — confirm how the tool runner invokes Execute.
    /// </remarks>
    public override string Execute(string args)
    {
        try
        {
            return ExecuteInternalAsync(args).GetAwaiter().GetResult();
        }
        catch (Exception ex)
        {
            WulaLog.Debug($"[Tool_AnalyzeScreen] Execute error: {ex}");
            return $"视觉分析出错: {ex.Message}";
        }
    }

    /// <summary>
    /// Resolves the VLM configuration, captures the screen and sends it to the model.
    /// </summary>
    /// <param name="xmlContent">Raw tool arguments passed to <c>ParseXmlArgs</c>.</param>
    /// <returns>The formatted analysis, or a message describing what went wrong.</returns>
    private async Task<string> ExecuteInternalAsync(string xmlContent)
    {
        try
        {
            var argsDict = ParseXmlArgs(xmlContent);

            // A missing or blank context would send an empty question to the model.
            string context =
                argsDict != null
                && argsDict.TryGetValue("context", out var ctx)
                && !string.IsNullOrWhiteSpace(ctx)
                    ? ctx
                    : DefaultContext;

            var settings = WulaFallenEmpireMod.settings;
            if (settings == null)
            {
                return "Mod 设置未初始化。";
            }

            // The dedicated VLM key wins; otherwise the main API key is reused.
            string vlmApiKey = !string.IsNullOrEmpty(settings.vlmApiKey) ? settings.vlmApiKey : settings.apiKey;
            string vlmBaseUrl = !string.IsNullOrEmpty(settings.vlmBaseUrl) ? settings.vlmBaseUrl : DefaultVlmBaseUrl;
            string vlmModel = !string.IsNullOrEmpty(settings.vlmModel) ? settings.vlmModel : DefaultVlmModel;

            if (string.IsNullOrEmpty(vlmApiKey))
            {
                return "VLM API 密钥未配置。请在 Mod 设置中配置 API 密钥。";
            }

            string base64Image = ScreenCaptureUtility.CaptureScreenAsBase64();
            if (string.IsNullOrEmpty(base64Image))
            {
                return "截屏失败,无法分析屏幕。";
            }

            var client = new SimpleAIClient(vlmApiKey, vlmBaseUrl, vlmModel);

            string result = await client.GetVisionCompletionAsync(
                VisionSystemPrompt,
                context,
                base64Image,
                maxTokens: 256,
                temperature: 0.3f
            );

            if (string.IsNullOrEmpty(result))
            {
                return "VLM 分析无响应,请检查 API 配置。";
            }

            return $"屏幕分析结果: {result.Trim()}";
        }
        catch (Exception ex)
        {
            WulaLog.Debug($"[Tool_AnalyzeScreen] Error: {ex}");
            return $"视觉分析出错: {ex.Message}";
        }
    }
}
|
||||||
|
}
|
||||||
@@ -74,6 +74,14 @@
|
|||||||
<HintPath>..\..\..\..\..\..\common\RimWorld\RimWorldWin64_Data\Managed\UnityEngine.TextRenderingModule.dll</HintPath>
|
<HintPath>..\..\..\..\..\..\common\RimWorld\RimWorldWin64_Data\Managed\UnityEngine.TextRenderingModule.dll</HintPath>
|
||||||
<Private>False</Private>
|
<Private>False</Private>
|
||||||
</Reference>
|
</Reference>
|
||||||
|
<Reference Include="UnityEngine.ImageConversionModule">
|
||||||
|
<HintPath>..\..\..\..\..\..\common\RimWorld\RimWorldWin64_Data\Managed\UnityEngine.ImageConversionModule.dll</HintPath>
|
||||||
|
<Private>False</Private>
|
||||||
|
</Reference>
|
||||||
|
<Reference Include="UnityEngine.ScreenCaptureModule">
|
||||||
|
<HintPath>..\..\..\..\..\..\common\RimWorld\RimWorldWin64_Data\Managed\UnityEngine.ScreenCaptureModule.dll</HintPath>
|
||||||
|
<Private>False</Private>
|
||||||
|
</Reference>
|
||||||
<Compile Include="**\*.cs" Exclude="bin\**;obj\**" />
|
<Compile Include="**\*.cs" Exclude="bin\**;obj\**" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
|
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
|
||||||
|
|||||||
@@ -10,6 +10,12 @@ namespace WulaFallenEmpire
|
|||||||
public int maxContextTokens = 100000;
|
public int maxContextTokens = 100000;
|
||||||
public bool enableDebugLogs = false;
|
public bool enableDebugLogs = false;
|
||||||
|
|
||||||
|
// VLM (视觉语言模型) 配置
|
||||||
|
public string vlmApiKey = "";
|
||||||
|
public string vlmBaseUrl = "https://dashscope.aliyuncs.com/compatible-mode/v1";
|
||||||
|
public string vlmModel = "qwen-vl-plus";
|
||||||
|
public bool enableVlmFeatures = false;
|
||||||
|
|
||||||
public override void ExposeData()
|
public override void ExposeData()
|
||||||
{
|
{
|
||||||
Scribe_Values.Look(ref apiKey, "apiKey", "sk-xxxxxxxx");
|
Scribe_Values.Look(ref apiKey, "apiKey", "sk-xxxxxxxx");
|
||||||
@@ -17,6 +23,13 @@ namespace WulaFallenEmpire
|
|||||||
Scribe_Values.Look(ref model, "model", "deepseek-chat");
|
Scribe_Values.Look(ref model, "model", "deepseek-chat");
|
||||||
Scribe_Values.Look(ref maxContextTokens, "maxContextTokens", 100000);
|
Scribe_Values.Look(ref maxContextTokens, "maxContextTokens", 100000);
|
||||||
Scribe_Values.Look(ref enableDebugLogs, "enableDebugLogs", false);
|
Scribe_Values.Look(ref enableDebugLogs, "enableDebugLogs", false);
|
||||||
|
|
||||||
|
// VLM 配置
|
||||||
|
Scribe_Values.Look(ref vlmApiKey, "vlmApiKey", "");
|
||||||
|
Scribe_Values.Look(ref vlmBaseUrl, "vlmBaseUrl", "https://dashscope.aliyuncs.com/compatible-mode/v1");
|
||||||
|
Scribe_Values.Look(ref vlmModel, "vlmModel", "qwen-vl-plus");
|
||||||
|
Scribe_Values.Look(ref enableVlmFeatures, "enableVlmFeatures", false);
|
||||||
|
|
||||||
base.ExposeData();
|
base.ExposeData();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
110
Tools/thenextagent-1/README.md
Normal file
110
Tools/thenextagent-1/README.md
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
# VLM Agent - 视觉语言模型电脑操作工具
|
||||||
|
|
||||||
|
基于Qwen-VL模型的自动化电脑操作工具,可以通过自然语言指令控制电脑完成各种任务。
|
||||||
|
|
||||||
|
## 项目简介
|
||||||
|
|
||||||
|
这是一个利用视觉语言模型(VLM)实现的电脑自动化操作工具,能够通过分析屏幕截图并执行相应操作来完成用户指定的任务。该工具可以模拟人类操作电脑的行为,包括鼠标点击、文本输入、窗口滚动等。
|
||||||
|
|
||||||
|
## 核心功能
|
||||||
|
|
||||||
|
### 支持的操作工具
|
||||||
|
|
||||||
|
1. **鼠标点击** - 在指定坐标点击鼠标
|
||||||
|
2. **文本输入** - 在指定位置输入文本(支持中英文)
|
||||||
|
3. **窗口滚动** - 在指定位置向上或向下滚动
|
||||||
|
4. **关闭窗口** - 关闭指定坐标所在的窗口
|
||||||
|
5. **Windows键** - 按下Windows键打开开始菜单
|
||||||
|
6. **回车键** - 按下回车键确认或换行
|
||||||
|
7. **删除文本** - 删除指定输入框中的文本
|
||||||
|
8. **鼠标拖拽** - 从起始坐标拖拽到结束坐标
|
||||||
|
9. **等待** - 等待指定时间
|
||||||
|
10. **打开终端** - 打开新的终端窗口
|
||||||
|
11. **快捷键** - 在指定位置点击后执行快捷键操作
|
||||||
|
|
||||||
|
### 特色功能
|
||||||
|
|
||||||
|
- **坐标系统**:使用0-1比例坐标系统,适配不同分辨率屏幕
|
||||||
|
- **图像处理**:自动缩放截图至最大边长1024像素以优化API调用
|
||||||
|
- **智能解析**:自动解析模型输出的工具调用指令
|
||||||
|
- **跨平台支持**:支持Windows、macOS和Linux系统
|
||||||
|
|
||||||
|
## 安装与使用
|
||||||
|
|
||||||
|
### 环境要求
|
||||||
|
|
||||||
|
- Python 3.8+(`main.py` 使用 `openai` 1.x SDK 和 `subprocess.run(capture_output=...)`,二者均不支持 Python 3.6)
|
||||||
|
- 阿里云API密钥(用于调用Qwen-VL模型)
|
||||||
|
|
||||||
|
### 安装依赖
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install pyautogui pillow openai pyperclip
|
||||||
|
```
|
||||||
|
|
||||||
|
### 运行程序
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python main.py
|
||||||
|
```
|
||||||
|
|
||||||
|
首次运行时,程序会提示您输入阿里云API密钥。
|
||||||
|
|
||||||
|
### 获取阿里云API密钥
|
||||||
|
|
||||||
|
1. 访问 [阿里云官网](https://www.aliyun.com/)
|
||||||
|
2. 注册或登录账号
|
||||||
|
3. 进入[阿里云控制台](https://home.console.aliyun.com/)
|
||||||
|
4. 开通DashScope服务并获取API密钥
|
||||||
|
|
||||||
|
## 使用示例
|
||||||
|
|
||||||
|
程序运行后,您可以尝试以下任务:
|
||||||
|
|
||||||
|
- "打开记事本并输入'Hello World'"
|
||||||
|
- "在浏览器中搜索'人工智能'"
|
||||||
|
- "创建一个名为'test.txt'的文件"
|
||||||
|
- "打开计算器并计算2+3的结果"
|
||||||
|
|
||||||
|
## 注意事项
|
||||||
|
|
||||||
|
1. 程序运行时,请勿手动操作电脑,以免干扰自动化流程
|
||||||
|
2. 如需紧急停止程序,可将鼠标快速移至屏幕左上角触发PyAutoGUI安全机制
|
||||||
|
3. 坐标系统使用比例值,x和y的取值范围都是0到1之间的小数
|
||||||
|
4. 请确保网络连接稳定,以便正常调用模型API
|
||||||
|
5. 不要在程序运行时关闭终端窗口
|
||||||
|
|
||||||
|
## 安全提醒
|
||||||
|
|
||||||
|
- API密钥是敏感信息,请妥善保管
|
||||||
|
- 程序只能执行您授权的任务,请勿尝试危险操作
|
||||||
|
- 如发现异常行为,请立即终止程序运行
|
||||||
|
|
||||||
|
## 技术架构
|
||||||
|
|
||||||
|
- **核心控制器**:VLMAgent类负责API连接、截图、坐标转换和操作执行
|
||||||
|
- **模型服务**:基于阿里云Qwen-VL模型提供视觉语言理解能力
|
||||||
|
- **操作执行**:通过pyautogui库实现底层的鼠标和键盘操作
|
||||||
|
- **图像处理**:使用PIL库处理屏幕截图以优化API传输效率
|
||||||
|
|
||||||
|
## 项目结构
|
||||||
|
|
||||||
|
```
|
||||||
|
.
|
||||||
|
├── main.py # 主程序文件
|
||||||
|
└── README.md # 项目说明文档
|
||||||
|
```
|
||||||
|
|
||||||
|
## 常见问题
|
||||||
|
|
||||||
|
### 如何提高操作准确性?
|
||||||
|
|
||||||
|
如果发现鼠标点击位置不准确,可能是坐标转换存在问题,程序会自动微调坐标值。如果是软件正在运行导致操作延迟,建议增加等待时间。
|
||||||
|
|
||||||
|
### 支持哪些操作系统?
|
||||||
|
|
||||||
|
支持Windows、macOS和Linux主流操作系统。
|
||||||
|
|
||||||
|
### 最多执行多少步操作?
|
||||||
|
|
||||||
|
默认情况下,程序最多执行50步操作以防止无限循环。
|
||||||
725
Tools/thenextagent-1/main.py
Normal file
725
Tools/thenextagent-1/main.py
Normal file
@@ -0,0 +1,725 @@
|
|||||||
|
import base64
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from openai import OpenAI
|
||||||
|
import pyautogui
|
||||||
|
from PIL import Image
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import tkinter as tk
|
||||||
|
import subprocess
|
||||||
|
import platform
|
||||||
|
import os
|
||||||
|
|
||||||
|
class VLMAgent:
|
||||||
|
def __init__(self, api_key, model_name="qwen3-vl-plus"):
|
||||||
|
"""
|
||||||
|
初始化VLM代理
|
||||||
|
"""
|
||||||
|
self.client = OpenAI(
|
||||||
|
api_key=api_key,
|
||||||
|
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
|
||||||
|
)
|
||||||
|
self.model_name = model_name
|
||||||
|
self.messages = []
|
||||||
|
self.screen_width, self.screen_height = self.get_screen_resolution()
|
||||||
|
print(f"屏幕分辨率: {self.screen_width} x {self.screen_height}")
|
||||||
|
|
||||||
|
# 启用PyAutoGUI的安全机制,将鼠标移到屏幕左上角可紧急停止
|
||||||
|
pyautogui.FAILSAFE = True
|
||||||
|
pyautogui.PAUSE = 1 # 每次操作后暂停1秒
|
||||||
|
|
||||||
|
self.tools = {
|
||||||
|
"mouse_click": self.mouse_click,
|
||||||
|
"type_text": self.type_text,
|
||||||
|
"scroll_window": self.scroll_window,
|
||||||
|
"close_window": self.close_window,
|
||||||
|
"press_windows_key": self.press_windows_key,
|
||||||
|
"press_enter": self.press_enter,
|
||||||
|
"delete_text": self.delete_text,
|
||||||
|
"mouse_drag": self.mouse_drag,
|
||||||
|
"wait": self.wait,
|
||||||
|
"open_terminal": self.open_terminal,
|
||||||
|
"press_hotkey": self.press_hotkey
|
||||||
|
}
|
||||||
|
|
||||||
|
def mouse_drag(self, start_x, start_y, end_x, end_y, duration=0.5):
    """
    Drag the mouse from a start point to an end point.

    :param start_x: proportional x of the start point (0-1)
    :param start_y: proportional y of the start point (0-1)
    :param end_x: proportional x of the end point (0-1)
    :param end_y: proportional y of the end point (0-1)
    :param duration: seconds the drag should take (default 0.5)
    :return: status text; failures are reported in the text rather than raised
    """
    try:
        screen_w, screen_h = self.screen_width, self.screen_height

        # Proportional -> pixel coordinates.
        src_x, src_y = int(start_x * screen_w), int(start_y * screen_h)
        dst_x, dst_y = int(end_x * screen_w), int(end_y * screen_h)

        print(f"拖拽起始坐标转换: ({start_x:.3f}, {start_y:.3f}) -> ({src_x}, {src_y})")
        print(f"拖拽结束坐标转换: ({end_x:.3f}, {end_y:.3f}) -> ({dst_x}, {dst_y})")

        bounds = f"(0-{screen_w}, 0-{screen_h})"

        # Reject a start point outside the screen rectangle.
        if src_x < 0 or src_x > screen_w or src_y < 0 or src_y > screen_h:
            return f"起始坐标 ({src_x}, {src_y}) 超出屏幕范围 {bounds}"

        # Same check for the end point.
        if dst_x < 0 or dst_x > screen_w or dst_y < 0 or dst_y > screen_h:
            return f"结束坐标 ({dst_x}, {dst_y}) 超出屏幕范围 {bounds}"

        pyautogui.moveTo(src_x, src_y)
        pyautogui.dragTo(dst_x, dst_y, duration=duration)

        ratio_part = f"({start_x:.3f}, {start_y:.3f}) -> ({end_x:.3f}, {end_y:.3f})"
        return f"成功从坐标 ({src_x}, {src_y}) 拖拽到 ({dst_x}, {dst_y}) (比例坐标: {ratio_part})"
    except Exception as exc:
        return f"拖拽操作失败: {str(exc)}"
|
||||||
|
|
||||||
|
def wait(self, seconds):
    """
    Sleep for the requested number of seconds.

    :param seconds: delay in seconds; anything ``float()`` accepts
    :return: status text; a non-positive delay or a conversion error is reported in the text
    """
    try:
        delay = float(seconds)
    except Exception as exc:
        return f"等待操作失败: {str(exc)}"

    if delay <= 0:
        return "等待时间必须是正数"

    try:
        print(f"等待 {delay} 秒...")
        time.sleep(delay)
    except Exception as exc:
        return f"等待操作失败: {str(exc)}"
    return f"成功等待了 {delay} 秒"
|
||||||
|
|
||||||
|
def open_terminal(self, command=""):
    """
    Open a new terminal window, optionally running a command in it.

    :param command: optional command line to run in the new terminal
    :return: status text; failures are reported in the text rather than raised

    NOTE(review): ``command`` is executed as given. On Windows it is interpolated into a
    shell string without escaping — confirm callers only pass trusted input.
    """
    import shutil  # local import: only this tool needs it

    try:
        system = platform.system()

        if system == "Windows":
            if command:
                # `start` returns immediately; /k keeps the new window open after the command.
                subprocess.run(f'start cmd /k "{command}"', shell=True)
            else:
                subprocess.run('start cmd', shell=True)

        elif system == "Darwin":  # macOS
            if command:
                # Escape backslashes and double quotes so the command survives being
                # embedded in an AppleScript string literal.
                escaped = command.replace('\\', '\\\\').replace('"', '\\"')
                subprocess.run(['osascript', '-e', f'tell app "Terminal" to do script "{escaped}"'])
                subprocess.run(['osascript', '-e', 'tell app "Terminal" to activate'])
            else:
                subprocess.run(['open', '-a', 'Terminal'])

        else:  # Linux or another Unix
            terminals = ['gnome-terminal', 'konsole', 'xterm']
            terminal = next((name for name in terminals if shutil.which(name)), None)
            if terminal is None:
                return f"未找到支持的终端程序,支持的终端包括: {', '.join(terminals)}"

            if command:
                # gnome-terminal separates its own options with "--"; the others use "-e".
                separator = '--' if terminal == 'gnome-terminal' else '-e'
                argv = [terminal, separator, 'bash', '-c', f'{command}; exec bash']
            else:
                argv = [terminal]

            # Popen instead of run: run() would block until the terminal window is closed.
            subprocess.Popen(argv)

        if command:
            return f"成功在新终端中执行命令: {command}"
        return "成功打开新终端窗口"

    except Exception as e:
        return f"打开终端失败: {str(e)}"
|
||||||
|
|
||||||
|
def press_hotkey(self, x, y, hotkey):
    """
    Click a position, then press a key or key combination.

    :param x: proportional x coordinate (0-1)
    :param y: proportional y coordinate (0-1)
    :param hotkey: key combination such as "ctrl+c", "ctrl+v" or "alt+f4";
                   "+", "-" and whitespace all act as separators
    :return: status text; failures are reported in the text rather than raised
    """
    try:
        # Proportional -> pixel coordinates.
        actual_x = int(x * self.screen_width)
        actual_y = int(y * self.screen_height)

        print(f"定位到坐标: ({actual_x}, {actual_y}) (比例坐标: {x:.3f}, {y:.3f})")

        if not (0 <= actual_x <= self.screen_width and 0 <= actual_y <= self.screen_height):
            return f"坐标 ({actual_x}, {actual_y}) 超出屏幕范围 (0-{self.screen_width}, 0-{self.screen_height})"

        # Parse before clicking so an unusable hotkey has no side effect on the screen.
        keys = hotkey.lower().replace('+', ' ').replace('-', ' ').split()
        if not keys:
            return "执行快捷键失败: 快捷键不能为空"

        pyautogui.click(actual_x, actual_y)
        time.sleep(0.5)  # give the click time to take effect

        if len(keys) == 1:
            pyautogui.press(keys[0])
        else:
            pyautogui.hotkey(*keys)

        return f"成功在坐标 ({actual_x}, {actual_y}) 处点击并执行快捷键: {hotkey}"
    except Exception as e:
        return f"执行快捷键失败: {str(e)}"
|
||||||
|
|
||||||
|
def close_window(self, x, y):
    """
    Click a window to focus it, then send the platform's close-window shortcut.

    :param x: proportional x coordinate (0-1) of a point inside the window
    :param y: proportional y coordinate (0-1) of a point inside the window
    :return: status text; failures are reported in the text rather than raised
    """
    try:
        actual_x = int(x * self.screen_width)
        actual_y = int(y * self.screen_height)

        print(f"点击窗口坐标: ({actual_x}, {actual_y}) (比例坐标: {x:.3f}, {y:.3f})")

        if not (0 <= actual_x <= self.screen_width and 0 <= actual_y <= self.screen_height):
            return f"坐标 ({actual_x}, {actual_y}) 超出屏幕范围 (0-{self.screen_width}, 0-{self.screen_height})"

        pyautogui.click(actual_x, actual_y)
        time.sleep(0.5)  # wait for the window to gain focus

        # Alt+F4 has no effect on macOS, where Command+W closes the front window.
        if platform.system() == "Darwin":
            pyautogui.hotkey('command', 'w')
        else:
            pyautogui.hotkey('alt', 'f4')
        return f"成功点击窗口坐标 ({actual_x}, {actual_y}) 并关闭窗口"
    except Exception as e:
        return f"关闭窗口失败: {str(e)}"
|
||||||
|
|
||||||
|
def press_windows_key(self):
    """
    Press the Windows key once.

    :return: status text; a failure is reported in the text rather than raised
    """
    try:
        pyautogui.press('win')
    except Exception as exc:
        return f"按下Windows键失败: {str(exc)}"
    return "成功按下Windows键"
|
||||||
|
|
||||||
|
def press_enter(self):
    """
    Press the Enter key once.

    :return: status text; a failure is reported in the text rather than raised
    """
    try:
        pyautogui.press('enter')
    except Exception as exc:
        return f"按下回车键失败: {str(exc)}"
    return "成功按下回车键"
|
||||||
|
|
||||||
|
def delete_text(self, x, y, count=1):
    """
    Click an input field to focus it, then press Backspace ``count`` times.

    :param x: proportional x coordinate (0-1)
    :param y: proportional y coordinate (0-1)
    :param count: number of characters to delete (default 1); converted with ``int()``
    :return: status text; failures are reported in the text rather than raised
    """
    try:
        # Step 1: proportional -> pixel coordinates.
        px = int(x * self.screen_width)
        py = int(y * self.screen_height)

        print(f"定位到输入框坐标: ({px}, {py}) (比例坐标: {x:.3f}, {y:.3f})")

        # Step 2: refuse points outside the screen rectangle.
        on_screen = 0 <= px <= self.screen_width and 0 <= py <= self.screen_height
        if not on_screen:
            return f"坐标 ({px}, {py}) 超出屏幕范围 (0-{self.screen_width}, 0-{self.screen_height})"

        # Step 3: focus the field.
        pyautogui.click(px, py)
        time.sleep(0.5)  # let the click register

        # Step 4: one Backspace per character, with a short pause in between.
        remaining = int(count)
        while remaining > 0:
            pyautogui.press('backspace')
            time.sleep(0.01)
            remaining -= 1

        return f"成功在坐标 ({px}, {py}) 处删除 {int(count)} 个字符"
    except Exception as exc:
        return f"删除文本失败: {str(exc)}"
|
||||||
|
|
||||||
|
def get_screen_resolution(self):
    """
    Query the screen resolution through a temporary Tk root window.

    :return: ``(width, height)`` in pixels as reported by Tk
    """
    root = tk.Tk()
    try:
        # Hide the root so no empty window flashes on screen while querying.
        root.withdraw()
        width = root.winfo_screenwidth()
        height = root.winfo_screenheight()
    finally:
        # Always release the Tk root, even if a query above raised.
        root.destroy()
    return width, height
|
||||||
|
|
||||||
|
def capture_screenshot(self):
    """
    Grab the screen, shrink it for the API and return it as an in-memory PNG.

    Records the original size in ``self.original_width`` / ``self.original_height`` and the
    scaled size in ``self.scaled_width`` / ``self.scaled_height``.

    :return: ``io.BytesIO`` positioned at the start of the PNG data
    """
    shot = pyautogui.screenshot()
    width, height = shot.size
    self.original_width, self.original_height = width, height
    print(f"原始截图尺寸: {self.original_width} x {self.original_height}")

    # Cap the longer edge at `limit` pixels and derive the other edge from the aspect ratio.
    limit = 1024
    if width > height:
        target_w = min(limit, width)
        target_h = int(height * target_w / width)
    else:
        target_h = min(limit, height)
        target_w = int(width * target_h / height)

    self.scaled_width, self.scaled_height = target_w, target_h
    print(f"缩放后截图尺寸: {self.scaled_width} x {self.scaled_height}")

    shrunk = shot.resize((target_w, target_h))

    # Serialize into memory and rewind so the caller can read from the beginning.
    buffer = io.BytesIO()
    shrunk.save(buffer, format='PNG')
    buffer.seek(0)
    return buffer
|
||||||
|
|
||||||
|
def convert_coordinates(self, x, y):
    """
    Map a point on the scaled screenshot back to pixel coordinates of the original capture.

    Relies on the sizes stored by ``capture_screenshot``.

    :param x: x coordinate on the scaled screenshot
    :param y: y coordinate on the scaled screenshot
    :return: ``(actual_x, actual_y)`` truncated to integers
    """
    # Per-axis factor from the scaled image back to the original image.
    scale_x = self.original_width / self.scaled_width
    scale_y = self.original_height / self.scaled_height

    mapped = (int(x * scale_x), int(y * scale_y))

    print(f"坐标转换: ({x}, {y}) -> ({mapped[0]}, {mapped[1]}) (缩放比例: {scale_x:.2f}, {scale_y:.2f})")

    return mapped
|
||||||
|
|
||||||
|
def encode_image_to_base64(self, image_buffer):
    """
    Read the remaining bytes of a binary buffer and return them Base64-encoded.

    :param image_buffer: readable binary file-like object
    :return: Base64 text as ``str``
    """
    raw_bytes = image_buffer.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
|
||||||
|
|
||||||
|
def mouse_click(self, x, y, button="left", clicks=1):
    """
    Click the mouse at a proportional screen position.

    :param x: proportional x coordinate (0-1)
    :param y: proportional y coordinate (0-1)
    :param button: mouse button name, e.g. "left" or "right"
    :param clicks: number of clicks (1 = single, 2 = double); converted with ``int()``
    :return: status text; failures are reported in the text rather than raised
    """
    try:
        # Proportional -> pixel coordinates.
        actual_x = int(x * self.screen_width)
        actual_y = int(y * self.screen_height)

        print(f"比例坐标转换: ({x:.3f}, {y:.3f}) -> ({actual_x}, {actual_y})")

        if not (0 <= actual_x <= self.screen_width and 0 <= actual_y <= self.screen_height):
            return f"坐标 ({actual_x}, {actual_y}) 超出屏幕范围 (0-{self.screen_width}, 0-{self.screen_height})"

        # Normalize once so the click and the status text agree
        # (the value may arrive as "1" or 2.0 rather than an int).
        click_count = int(clicks)
        pyautogui.click(actual_x, actual_y, button=button, clicks=click_count)

        button_text = {"left": "左键", "right": "右键", "middle": "中键"}.get(button, str(button))
        click_text = {1: "单击", 2: "双击"}.get(click_count, f"{click_count}连击")
        return f"成功在坐标 ({actual_x}, {actual_y}) 处{button_text}{click_text} (比例坐标: {x:.3f}, {y:.3f})"
    except Exception as e:
        return f"点击失败: {str(e)}"
|
||||||
|
|
||||||
|
def scroll_window(self, x, y, direction="up"):
    """
    Scroll the window under a proportional screen position by a fixed amount.

    :param x: proportional x coordinate (0-1)
    :param y: proportional y coordinate (0-1)
    :param direction: "up" scrolls up; any other value scrolls down
    :return: status text; failures are reported in the text rather than raised
    """
    try:
        step_count = 1400  # fixed scroll amount passed to pyautogui.scroll

        px, py = int(x * self.screen_width), int(y * self.screen_height)

        print(f"滚动窗口 - 比例坐标转换: ({x:.3f}, {y:.3f}) -> ({px}, {py})")

        if px < 0 or px > self.screen_width or py < 0 or py > self.screen_height:
            return f"坐标 ({px}, {py}) 超出屏幕范围 (0-{self.screen_width}, 0-{self.screen_height})"

        # Positive amounts scroll up, negative amounts scroll down.
        going_up = direction == "up"
        signed_steps = step_count if going_up else -step_count

        pyautogui.scroll(signed_steps, x=px, y=py)

        label = "向上" if going_up else "向下"
        return f"成功在坐标 ({px}, {py}) 处{label}滚动 {step_count} 步 (比例坐标: {x:.3f}, {y:.3f})"
    except Exception as exc:
        return f"滚动窗口失败: {str(exc)}"
|
||||||
|
|
||||||
|
def type_text(self, x, y, text):
    """
    Click a position and enter text there by pasting it from the clipboard.

    Falls back to ``_type_text_fallback`` when ``pyperclip`` is not installed.
    Note that the clipboard content is overwritten with ``text``.

    :param x: proportional x coordinate (0-1)
    :param y: proportional y coordinate (0-1)
    :param text: text to enter
    :return: status text; failures are reported in the text rather than raised
    """
    try:
        import pyperclip

        # 1. Proportional -> pixel coordinates.
        actual_x = int(x * self.screen_width)
        actual_y = int(y * self.screen_height)

        print(f"定位到坐标: ({actual_x}, {actual_y}) (比例坐标: {x:.3f}, {y:.3f})")

        # 2. Refuse points outside the screen rectangle.
        if not (0 <= actual_x <= self.screen_width and 0 <= actual_y <= self.screen_height):
            return f"坐标 ({actual_x}, {actual_y}) 超出屏幕范围 (0-{self.screen_width}, 0-{self.screen_height})"

        # 3. Focus the target.
        pyautogui.click(actual_x, actual_y)
        time.sleep(0.5)  # let the click register

        # 4. Put the text on the clipboard.
        pyperclip.copy(text)
        time.sleep(0.2)  # let the clipboard settle

        # 5. Paste. macOS uses Command+V; Ctrl+V does not paste there.
        paste_modifier = 'command' if platform.system() == 'Darwin' else 'ctrl'
        pyautogui.hotkey(paste_modifier, 'v')

        return f"成功在坐标 ({actual_x}, {actual_y}) 处输入文本: {text}"
    except ImportError:
        # pyperclip is unavailable: type the text key by key instead.
        return self._type_text_fallback(x, y, text)
    except Exception as e:
        return f"输入文本失败: {str(e)}"
|
||||||
|
|
||||||
|
def _type_text_fallback(self, x, y, text):
    """
    Keystroke-based text entry used when clipboard pasting is unavailable.

    :param x: horizontal position as a fraction of the screen width
    :param y: vertical position as a fraction of the screen height
    :param text: the text to type with ``pyautogui.write``
    :return: a status string; exceptions are reported in the returned text
    """
    try:
        # Step 1: proportional -> absolute screen coordinates.
        px = int(x * self.screen_width)
        py = int(y * self.screen_height)

        # Step 2: bail out when the point is off-screen.
        on_screen = 0 <= px <= self.screen_width and 0 <= py <= self.screen_height
        if not on_screen:
            return f"坐标 ({px}, {py}) 超出屏幕范围 (0-{self.screen_width}, 0-{self.screen_height})"

        # Step 3: click the target and give the click time to take effect.
        pyautogui.click(px, py)
        time.sleep(0.5)

        # Step 4: type the text key by key.
        # NOTE(review): the original comment says this path supports English
        # input; non-ASCII text is presumably not typed correctly - confirm.
        pyautogui.write(text, interval=0.1)

        return f"成功在坐标 ({px}, {py}) 处输入文本: {text}"
    except Exception as err:
        return f"输入文本失败: {str(err)}"
|
||||||
|
|
||||||
|
def parse_tool_calls(self, response_text):
    """
    Extract tool invocations from a model reply.

    Calls are written as ``<|tool_call|>name(key=value, ...)<|tool_call|>``.

    :param response_text: the raw text returned by the model
    :return: a list of ``{"name": str, "arguments": dict}`` entries in the
        order the calls appear. Unquoted values that parse as numbers become
        ``float``; quoted values are always kept as strings.
    """

    def split_arguments(arg_text):
        """Split on commas that are not inside a quoted string."""
        pieces = []
        current = []
        open_quote = None
        for ch in arg_text:
            if open_quote is not None:
                # Inside a quoted value: only the matching quote closes it.
                if ch == open_quote:
                    open_quote = None
            elif ch in ('"', "'"):
                open_quote = ch
            elif ch == ',':
                pieces.append(''.join(current))
                current = []
                continue
            current.append(ch)
        pieces.append(''.join(current))
        return pieces

    def convert_value(raw):
        """Turn one raw argument value into a str or float."""
        raw = raw.strip()
        # A fully quoted value is text by definition, even if it looks
        # numeric (e.g. text="123" must be typed as 123, not 123.0).
        if len(raw) >= 2 and raw[0] == raw[-1] and raw[0] in ('"', "'"):
            return raw[1:-1]
        # Unquoted (or malformed) value: drop stray quotes, then try a number.
        bare = raw.strip('"').strip("'")
        try:
            return float(bare)
        except ValueError:
            return bare

    # Locate every tool call; DOTALL lets a call span several lines.
    tool_call_pattern = r'<\|tool_call\|>(.*?)<\|tool_call\|>'
    tool_calls = re.findall(tool_call_pattern, response_text, re.DOTALL)

    parsed_calls = []
    for call in tool_calls:
        call = call.strip()
        open_paren = call.find('(')
        close_paren = call.rfind(')')
        if open_paren == -1 or close_paren == -1:
            # Not of the form name(...); ignore it.
            continue

        # Slice relative to the parenthesis itself so that whitespace between
        # the name and "(" does not shift the argument string.
        func_name = call[:open_paren].strip()
        args_str = call[open_paren + 1:close_paren].strip()

        args = {}
        if args_str:
            # Argument text looks like: x=100, y=200
            for arg in split_arguments(args_str):
                if '=' in arg:
                    key, value = arg.split('=', 1)
                    args[key.strip()] = convert_value(value)

        parsed_calls.append({
            "name": func_name,
            "arguments": args
        })

    return parsed_calls
|
||||||
|
|
||||||
|
def execute_tool_calls(self, tool_calls):
    """
    Run each parsed tool call and collect one report line per call.

    :param tool_calls: list of ``{"name": ..., "arguments": {...}}`` dicts
    :return: the report lines joined with newlines; unknown tools and
        exceptions raised by a tool are reported as lines, not raised
    """
    report = []
    for request in tool_calls:
        tool_name = request["name"]
        tool_args = request["arguments"]

        # Tools are looked up by name in the self.tools mapping.
        if tool_name not in self.tools:
            report.append(f"未知工具: {tool_name}")
            continue

        try:
            outcome = self.tools[tool_name](**tool_args)
            line = f"工具 {tool_name} 执行结果: {outcome}"
        except Exception as err:
            line = f"执行工具 {tool_name} 时出错: {str(err)}"
        report.append(line)

    return "\n".join(report)
|
||||||
|
|
||||||
|
def run_task(self, task_description, max_steps=50):
    """
    Drive the screenshot -> model -> tool loop for one task.

    Each step captures a screenshot, sends it to the model together with the
    accumulated message history, prints the reply, and executes any tool
    calls found in it. The loop ends when the reply contains no tool call,
    when the model call raises, or after ``max_steps`` steps.

    :param task_description: the task text supplied by the user
    :param max_steps: upper bound on the number of loop iterations
    """
    print(f"开始执行任务: {task_description}")
    print(f"屏幕分辨率: {self.screen_width} x {self.screen_height}")

    # System prompt describing the tool-call format and the available tools.
    system_prompt = f"""
你是一个用户助理,同时拥有操控电脑的能力,你现在面对看到的图像是电脑的用户界面,请分析屏幕内容(屏幕大小是{self.screen_width}*{self.screen_height}),如果需要操作电脑,请按以下格式调用工具:

<|tool_call|>函数名(参数1=值1, 参数2=值2)<|tool_call|>

可用的工具包括:
1. mouse_click(x=比例x, y=比例y, button="left", clicks=1) - 在指定坐标点击鼠标
- 坐标为比例(0-1之间的小数)
- button参数可以是"left"(左键,默认)或"right"(右键)
- clicks参数可以是1(单击,默认)或2(双击),必须是整数
例如:mouse_click(x=0.5, y=0.5) 表示在屏幕中心点左键单击
例如:mouse_click(x=0.3, y=0.4, button="right") 表示在坐标(0.3,0.4)处右键单击
例如:mouse_click(x=0.6, y=0.7, clicks=2) 表示在坐标(0.6,0.7)处左键双击
例如:mouse_click(x=0.8, y=0.9, button="right", clicks=2) 表示在坐标(0.8,0.9)处右键双击
2. type_text(x=比例x, y=比例y, text="要输入的文本") - 在指定坐标点击并输入文本,支持中英文输入
例如:type_text(x=0.3, y=0.4, text="你好世界") 表示在坐标(0.3,0.4)处点击并输入"你好世界"
例如:type_text(x=0.5, y=0.6, text="Hello World") 表示在坐标(0.5,0.6)处点击并输入"Hello World"
请注意:输入文字请一次性输入一行即可,然后需要回车换行或者编辑再调用其他工具执行。不要出现“/n”工具无法识别这种换行指令
3. scroll_window(x=比例x, y=比例y, direction="up") - 在指定坐标处滚动窗口,direction参数可以是"up"或"down",表示向上或向下滚动
例如:scroll_window(x=0.5, y=0.5, direction="up") 表示在屏幕中心位置向上滚动
例如:scroll_window(x=0.3, y=0.4, direction="down") 表示在坐标(0.3,0.4)处向下滚动
4. close_window(x=比例x, y=比例y) - 关闭指定坐标所在的窗口,先点击该窗口获取焦点再关闭
例如:close_window(x=0.5, y=0.5) 表示点击屏幕中心的窗口并关闭它
5. press_windows_key() - 按下Windows键,用于打开开始菜单
例如:press_windows_key() 表示按下Windows键
6. press_enter() - 按下回车键,可以用于换行或者确认
例如:press_enter() 表示按下回车键
7. delete_text(x=比例x, y=比例y, count=1) - 删除指定输入框中的文本
- 先点击输入框获取焦点,然后删除指定数量的字符
- count参数是要删除的字符数量,默认为1(你在设置的时候请尽可能精确)
例如:delete_text(x=0.4, y=0.5, count=5) 表示点击坐标(0.4,0.5)处的输入框并删除5个字符
例如:delete_text(x=0.6, y=0.7) 表示点击坐标(0.6,0.7)处的输入框并删除1个字符
8. mouse_drag(start_x=起始比例x, start_y=起始比例y, end_x=结束比例x, end_y=结束比例y, duration=0.5) - 从起始坐标拖拽到结束坐标
- 从起始点拖拽到结束点,duration参数为拖拽过程耗时(秒),默认为0.5秒
例如:mouse_drag(start_x=0.2, start_y=0.3, end_x=0.8, end_y=0.3) 表示从屏幕水平位置20%、垂直位置30%的地方拖拽到水平位置80%、垂直位置30%的地方
例如:mouse_drag(start_x=0.5, start_y=0.5, end_x=0.5, end_y=0.2, duration=1.0) 表示从屏幕中心向上拖拽,耗时1秒
9. wait(seconds=等待秒数) - 等待指定的时间(秒)
- seconds参数为等待时间,可以是整数或小数
例如:wait(seconds=3) 表示等待3秒
例如:wait(seconds=0.5) 表示等待0.5秒(500毫秒)
这个工具在需要等待某些操作完成或界面更新时非常有用
10. open_terminal(command="") - 打开一个新的终端窗口
- command参数为可选,如果提供则在新终端中执行该命令
例如:open_terminal() 表示打开一个新的空终端窗口
例如:open_terminal(command="dir") 表示在新终端中执行dir命令(Windows)或ls命令(Unix/Linux/macOS)
注意:终端默认指向的目录一般是软件所处目录,不一定是桌面,请你进入终端后自己判断所处位置
11. press_hotkey(x=比例x, y=比例y, hotkey="快捷键组合") - 在指定位置点击后模拟键盘快捷键
- 先在指定坐标处点击获取焦点,然后执行快捷键操作
- hotkey参数为快捷键组合,例如 "ctrl+c", "ctrl+v", "ctrl+a", "alt+f4" 等
例如:press_hotkey(x=0.5, y=0.5, hotkey="ctrl+c") 表示在屏幕中心点击并执行复制操作
例如:press_hotkey(x=0.3, y=0.4, hotkey="alt+f4") 表示在坐标(0.3,0.4)处点击并执行关闭窗口操作

请在每一步操作后给出简要说明,然后使用工具调用格式指定下一步操作。
如果你认为已经完成任务了,或者你需要用户提供更多信息,或者需要用户帮助你(比如有些输入需要用户输入,或者需要用户帮忙操作),你则不需要调用工具了,这样才可以获取到用户的输入
注意:坐标系统使用比例值,x和y的取值范围都是0到1之间的小数,其中(0,0)代表屏幕左上角,(1,1)代表屏幕右下角。
所有参数值必须是正确的数据类型,特别是clicks参数必须是整数(1或2),不能是浮点数。
如果不需要操作电脑,请你以友好的语言回复用户
请你注意,你是运行在终端中,所以无论如何,请不要关闭你存在对话的终端,你所在的终端会保持打开,请不要关闭它。一般的,你的终端上会存在历史聊天记录,或者= VLM 电脑操作工具 =字样
如果你在操作鼠标的时候,发现并没有实现预计的效果,可能是因为鼠标操作的坐标出现问题或者系统正在运行,若是鼠标操作的坐标出现问题,请你略微调整坐标值。如果是软件正在运行,请等待软件启动结束。
如果你认为用户的指令需要使用工具才能完成,请在任务的开始时,先计划好自己的操作步骤。
如果一项任务可以使用终端即可完成,请优先选择终端,如果一项操作可以只使用快捷键完成,请优先选择快捷键


""".strip()

    # Every task starts from a fresh history containing only the system prompt.
    self.messages = [{"role": "system", "content": system_prompt}]

    step = 0
    while step < max_steps:
        step += 1
        print(f"\n--- 步骤 {step} ---")

        # Capture the current screen and encode it for the request.
        screenshot_buffer = self.capture_screenshot()
        base64_image = self.encode_image_to_base64(screenshot_buffer)

        # The first step states the task; later steps only ask to continue.
        if step == 1:
            prompt_text = f"请完成以下任务: {task_description}"
        else:
            prompt_text = "这是当前屏幕状态,请继续完成任务"

        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{base64_image}"},
        }
        self.messages.append({
            "role": "user",
            "content": [{"type": "text", "text": prompt_text}, image_part],
        })

        # Query the model; any exception in this step ends the task.
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=self.messages,
                temperature=0.3,
                max_tokens=1024
            )

            response_text = response.choices[0].message.content
            self.messages.append({"role": "assistant", "content": response_text})

            print("模型响应:")
            print(response_text)

            # Parse the reply; no tool call means the model is done or is
            # waiting for the user.
            tool_calls = self.parse_tool_calls(response_text)
            if not tool_calls:
                print("未检测到工具调用,任务可能已完成")
                break

            print("\n检测到工具调用:")
            for call in tool_calls:
                rendered_args = ', '.join([f'{k}={v}' for k, v in call['arguments'].items()])
                print(f"- {call['name']}({rendered_args})")

            tool_result = self.execute_tool_calls(tool_calls)
            print("\n工具执行结果:")
            print(tool_result)

            # Feed the tool output back to the model as a user message.
            self.messages.append({
                "role": "user",
                "content": f"工具执行结果:\n{tool_result}"
            })

            # Pause briefly so the action takes effect before the next screenshot.
            time.sleep(3)

        except Exception as e:
            print(f"调用模型时发生错误: {e}")
            break

    print(f"\n任务执行完成,共执行 {step} 步")
|
||||||
|
|
||||||
|
def main():
    """
    Interactive entry point: ask for an API key, then run tasks in a loop.

    Returns early when no usable API key is entered. The loop ends when the
    user types one of the exit words.
    """
    # Banner and API key prompt.
    for banner_line in (
        "=== VLM 电脑操作工具 ===",
        "欢迎使用qwen3VL电脑操作工具",
        "您需要一个阿里云API密钥才能使用此工具",
        "获取地址: https://www.aliyun.com/",
    ):
        print(banner_line)
    api_key = input("请输入您的阿里云API密钥: ").strip()

    # An empty key or the placeholder value is rejected.
    if api_key in ("", "sk-your-api-key"):
        print("错误: 请输入有效的阿里云API密钥")
        print("请访问阿里云控制台获取API密钥")
        return

    # Create the agent.
    agent = VLMAgent(api_key)

    print("\n系统已就绪,您可以输入各种任务请求")
    print("示例任务:")
    print("  - 打开记事本并输入'Hello World'")
    print("  - 在浏览器中搜索'人工智能'")
    print("  - 创建一个名为'test.txt'的文件")
    print("输入'退出'、'exit'或'quit'结束程序")
    print("-" * 50)

    exit_words = ('退出', 'exit', 'quit', 'q')
    while True:
        # Read the next task from the user.
        task = input("\n请输入任务: ").strip()

        # Exit words end the program.
        if task.lower() in exit_words:
            print("程序结束,再见!")
            break

        # Blank input: ask again.
        if task == "":
            print("请输入有效的任务")
            continue

        # Run the task.
        print(f"\n开始执行任务: {task}")
        agent.run_task(task)
        print(f"\n任务 '{task}' 执行完成")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Check that the required packages can be imported before starting.
    try:
        import pyautogui
        import PIL
        import tkinter
    except ImportError as e:
        print(f"缺少必要的依赖包: {e}")
        print("请安装依赖: pip install pyautogui pillow openai")
        # Raise SystemExit directly: the bare exit() helper is injected by the
        # site module for interactive sessions and is not guaranteed to exist
        # (e.g. under `python -S` or in some frozen builds).
        raise SystemExit(1)

    main()
|
||||||
Reference in New Issue
Block a user