baqr committed on
Commit
d73c58e
·
verified ·
1 Parent(s): d50be28

Upload folder using huggingface_hub

Files changed (48)
  1. .gitattributes +10 -0
  2. .gitignore +18 -0
  3. LICENSE +190 -0
  4. README.md +269 -8
  5. app.py +633 -0
  6. assets/Teaser.gif +3 -0
  7. assets/examples/init_states/amazon.png +3 -0
  8. assets/examples/init_states/booking.png +3 -0
  9. assets/examples/init_states/honkai_star_rail.png +3 -0
  10. assets/examples/init_states/honkai_star_rail_showui.png +3 -0
  11. assets/examples/init_states/ign.png +3 -0
  12. assets/examples/init_states/powerpoint.png +3 -0
  13. assets/examples/init_states/powerpoint_homepage.png +3 -0
  14. assets/examples/ootb_examples.json +73 -0
  15. assets/gradio_interface.png +3 -0
  16. assets/ootb_icon.png +0 -0
  17. assets/ootb_logo.png +0 -0
  18. assets/wechat_3.jpg +3 -0
  19. computer_use_demo/__init__.py +0 -0
  20. computer_use_demo/executor/anthropic_executor.py +135 -0
  21. computer_use_demo/executor/showui_executor.py +376 -0
  22. computer_use_demo/gui_agent/actor/showui_agent.py +178 -0
  23. computer_use_demo/gui_agent/actor/uitars_agent.py +169 -0
  24. computer_use_demo/gui_agent/llm_utils/llm_utils.py +109 -0
  25. computer_use_demo/gui_agent/llm_utils/oai.py +218 -0
  26. computer_use_demo/gui_agent/llm_utils/qwen.py +108 -0
  27. computer_use_demo/gui_agent/llm_utils/run_llm.py +44 -0
  28. computer_use_demo/gui_agent/planner/anthropic_agent.py +206 -0
  29. computer_use_demo/gui_agent/planner/api_vlm_planner.py +305 -0
  30. computer_use_demo/gui_agent/planner/local_vlm_planner.py +235 -0
  31. computer_use_demo/loop.py +276 -0
  32. computer_use_demo/remote_inference.py +453 -0
  33. computer_use_demo/tools/__init__.py +16 -0
  34. computer_use_demo/tools/base.py +69 -0
  35. computer_use_demo/tools/bash.py +136 -0
  36. computer_use_demo/tools/collection.py +41 -0
  37. computer_use_demo/tools/colorful_text.py +27 -0
  38. computer_use_demo/tools/computer.py +621 -0
  39. computer_use_demo/tools/edit.py +290 -0
  40. computer_use_demo/tools/logger.py +21 -0
  41. computer_use_demo/tools/run.py +42 -0
  42. computer_use_demo/tools/screen_capture.py +171 -0
  43. dev-requirements.txt +23 -0
  44. docs/README_cn.md +172 -0
  45. install_tools/install_showui-awq-4bit.py +17 -0
  46. install_tools/install_showui.py +17 -0
  47. install_tools/install_uitars-2b-sft.py +17 -0
  48. install_tools/test_ui-tars_server.py +82 -0
.gitattributes CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/Teaser.gif filter=lfs diff=lfs merge=lfs -text
37
+ assets/examples/init_states/amazon.png filter=lfs diff=lfs merge=lfs -text
38
+ assets/examples/init_states/booking.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/examples/init_states/honkai_star_rail.png filter=lfs diff=lfs merge=lfs -text
40
+ assets/examples/init_states/honkai_star_rail_showui.png filter=lfs diff=lfs merge=lfs -text
41
+ assets/examples/init_states/ign.png filter=lfs diff=lfs merge=lfs -text
42
+ assets/examples/init_states/powerpoint.png filter=lfs diff=lfs merge=lfs -text
43
+ assets/examples/init_states/powerpoint_homepage.png filter=lfs diff=lfs merge=lfs -text
44
+ assets/gradio_interface.png filter=lfs diff=lfs merge=lfs -text
45
+ assets/wechat_3.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ .venv
2
+ .ruff_cache
3
+ __pycache__
4
+ .pytest_cache
5
+ .cache
6
+ .ipynb_checkpoints
7
+ .ipynb
8
+ .DS_Store
9
+ /tmp
10
+ /.gradio
11
+ /.zed
12
+ /showui*
13
+ /ui-tars*
14
+ /demo
15
+ /Qwen*
16
+ /install_tools/install_qwen*
17
+ /dev_tools*
18
+ test.ipynb
LICENSE ADDED
@@ -0,0 +1,190 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ Copyright [2024] [Show Lab Computer-Use-OOTB Team]
179
+
180
+ Licensed under the Apache License, Version 2.0 (the "License");
181
+ you may not use this file except in compliance with the License.
182
+ You may obtain a copy of the License at
183
+
184
+ http://www.apache.org/licenses/LICENSE-2.0
185
+
186
+ Unless required by applicable law or agreed to in writing, software
187
+ distributed under the License is distributed on an "AS IS" BASIS,
188
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
189
+ See the License for the specific language governing permissions and
190
+ limitations under the License.
README.md CHANGED
@@ -1,12 +1,273 @@
1
  ---
2
- title: Computer Use Ootb
3
- emoji: 👀
4
- colorFrom: pink
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.16.2
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: computer_use_ootb
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 5.13.2
6
  ---
7
+ <h2 align="center">
8
+ <a href="https://computer-use-ootb.github.io">
9
+ <img src="./assets/ootb_logo.png" alt="Logo" style="display: block; margin: 0 auto; filter: invert(1) brightness(2);">
10
+ </a>
11
+ </h2>
12
+
13
+
14
+ <h5 align="center"> If you like our project, please give us a star ⭐ on GitHub for the latest update.</h5>
15
+
16
+ <h5 align=center>
17
+
18
+ [![arXiv](https://img.shields.io/badge/Arxiv-2411.10323-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2411.10323)
19
+ [![Project Page](https://img.shields.io/badge/Project_Page-GUI_Agent-blue)](https://computer-use-ootb.github.io)
20
+ [![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2Fshowlab%2Fcomputer_use_ootb&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false)](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2Fshowlab%2Fcomputer_use_ootb&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false)
21
+
22
+
23
+ </h5>
24
+
25
+ ## <img src="./assets/ootb_icon.png" alt="Star" style="height:25px; vertical-align:middle; filter: invert(1) brightness(2);"> Overview
26
+ **Computer Use <span style="color:rgb(106, 158, 210)">O</span><span style="color:rgb(111, 163, 82)">O</span><span style="color:rgb(209, 100, 94)">T</span><span style="color:rgb(238, 171, 106)">B</span>**<img src="./assets/ootb_icon.png" alt="Star" style="height:20px; vertical-align:middle; filter: invert(1) brightness(2);"> is an out-of-the-box (OOTB) solution for Desktop GUI Agent, including API-based (**Claude 3.5 Computer Use**) and locally-running models (**<span style="color:rgb(106, 158, 210)">S</span><span style="color:rgb(111, 163, 82)">h</span><span style="color:rgb(209, 100, 94)">o</span><span style="color:rgb(238, 171, 106)">w</span>UI**, **UI-TARS**).
27
+
28
+ **No Docker** is required, and it supports both **Windows** and **macOS**. OOTB provides a user-friendly interface based on Gradio. 🎨
29
+
30
+ Visit our study of the Claude 3.5 Computer Use GUI Agent on the [[project page]](https://computer-use-ootb.github.io). 🌐
31
+
32
+ ## Update
33
+ - **[2025/02/08]** We've added support for [**UI-TARS**](https://github.com/bytedance/UI-TARS). Follow the [Cloud Deployment](https://github.com/bytedance/UI-TARS?tab=readme-ov-file#cloud-deployment) or [VLLM deployment](https://github.com/bytedance/UI-TARS?tab=readme-ov-file#local-deployment-vllm) guide to deploy UI-TARS and run it locally in OOTB.
34
+ - **Major Update! [2024/12/04]** **Local Run🔥** is now live! Say hello to [**<span style="color:rgb(106, 158, 210)">S</span><span style="color:rgb(111, 163, 82)">h</span><span style="color:rgb(209, 100, 94)">o</span><span style="color:rgb(238, 171, 106)">w</span>UI**](https://github.com/showlab/ShowUI), an open-source 2B vision-language-action (VLA) model for GUI Agents. Now compatible with `"gpt-4o + ShowUI" (~200x cheaper)`* & `"Qwen2-VL + ShowUI" (~30x cheaper)`* for only a few cents per task💰! <span style="color: grey; font-size: small;">*compared to Claude Computer Use</span>.
35
+ - **[2024/11/20]** We've added some examples to help you get hands-on experience with Claude 3.5 Computer Use.
36
+ - **[2024/11/19]** Forget about the single-display limit set by Anthropic - you can now use **multiple displays** 🎉!
37
+ - **[2024/11/18]** We've released a deep analysis of Claude 3.5 Computer Use: [https://arxiv.org/abs/2411.10323](https://arxiv.org/abs/2411.10323).
38
+ - **[2024/11/11]** Forget about the low-resolution display limit set by Anthropic — you can now use *any resolution you like* and still keep the **screenshot token cost low** 🎉!
39
+ - **[2024/11/11]** Now both **Windows** and **macOS** platforms are supported 🎉!
40
+ - **[2024/10/25]** Now you can **Remotely Control** your computer 💻 through your mobile device 📱 — **No Mobile App Installation** required! Give it a try and have fun 🎉.
41
+
42
+
43
+ ## Demo Video
44
+
45
+ https://github.com/user-attachments/assets/f50b7611-2350-4712-af9e-3d31e30020ee
46
+
47
+ <div style="display: flex; justify-content: space-around;">
48
+ <a href="https://youtu.be/Ychd-t24HZw" target="_blank" style="margin-right: 10px;">
49
+ <img src="https://img.youtube.com/vi/Ychd-t24HZw/maxresdefault.jpg" alt="Watch the video" width="48%">
50
+ </a>
51
+ <a href="https://youtu.be/cvgPBazxLFM" target="_blank">
52
+ <img src="https://img.youtube.com/vi/cvgPBazxLFM/maxresdefault.jpg" alt="Watch the video" width="48%">
53
+ </a>
54
+ </div>
55
+
56
+
57
+ ## 🚀 Getting Started
58
+
59
+ ### 0. Prerequisites
60
+ - Install Miniconda on your system via this [link](https://www.anaconda.com/download?utm_source=anacondadocs&utm_medium=documentation&utm_campaign=download&utm_content=topnavalldocs) (**Python version: >= 3.12**).
61
+ - Hardware Requirements (optional, for ShowUI local-run):
62
+ - **Windows (CUDA-enabled):** A compatible NVIDIA GPU with CUDA support, >=6GB GPU memory
63
+ - **macOS (Apple Silicon):** M1 chip (or newer), >=16GB unified RAM
64
+
65
+
66
+ ### 1. Clone the Repository 📂
67
+ Open the Conda terminal. (After installing Miniconda, it will appear in the Start menu.)
68
+ Run the following commands in the **Conda terminal**:
69
+ ```bash
70
+ git clone https://github.com/showlab/computer_use_ootb.git
71
+ cd computer_use_ootb
72
+ ```
73
+
74
+ ### 2.1 Install Dependencies 🔧
75
+ ```bash
76
+ pip install -r dev-requirements.txt
77
+ ```
78
+
79
+ ### 2.2 (Optional) Get Prepared for **<span style="color:rgb(106, 158, 210)">S</span><span style="color:rgb(111, 163, 82)">h</span><span style="color:rgb(209, 100, 94)">o</span><span style="color:rgb(238, 171, 106)">w</span>UI** Local-Run
80
+
81
+ 1. Download all files of the ShowUI-2B model via the following command. Ensure the `ShowUI-2B` folder is under the `computer_use_ootb` folder.
82
+
83
+ ```bash
84
+ python install_tools/install_showui.py
85
+ ```
86
+
87
+ 2. Make sure to install the correct GPU version of PyTorch (CUDA, MPS, etc.) on your machine. See [install guide and verification](https://pytorch.org/get-started/locally/).
88
+
89
+ 3. Get API keys for [GPT-4o](https://platform.openai.com/docs/quickstart) or [Qwen-VL](https://help.aliyun.com/zh/dashscope/developer-reference/acquisition-and-configuration-of-api-key). For mainland China users, a free Qwen API trial covering the first 1 million tokens is [available](https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-plus-api).
90
+
91
+ ### 2.3 (Optional) Get Prepared for **UI-TARS** Local-Run
92
+
93
+ 1. Follow [Cloud Deployment](https://github.com/bytedance/UI-TARS?tab=readme-ov-file#cloud-deployment) or [VLLM deployment](https://github.com/bytedance/UI-TARS?tab=readme-ov-file#local-deployment-vllm) guides to deploy your UI-TARS server.
94
+
95
+ 2. Test your UI-TARS server with the script `.\install_tools\test_ui-tars_server.py`, for example (check the script for any arguments it expects, such as your server URL):
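+
+ ```bash
+ python install_tools/test_ui-tars_server.py
+ ```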
96
+
97
+ ### 2.4 (Optional) Deploy a Qwen Model as the Planner on an SSH Server
98
+ 1. Clone this project on your SSH server.
99
+
100
+ 2. Start the remote inference service: `python computer_use_demo/remote_inference.py` (see the sketch below).
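+
+ A minimal sketch of these two steps on the SSH server (installing the dependencies there is an assumption; adapt it to your environment):
+ ```bash
+ # on the remote (SSH) server
+ git clone https://github.com/showlab/computer_use_ootb.git
+ cd computer_use_ootb
+ pip install -r dev-requirements.txt   # assumed: same dependencies as the local install
+ python computer_use_demo/remote_inference.py
+ ```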
101
+ ### 3. Start the Interface ▶️
102
+
103
+ **Start the OOTB interface:**
104
+ ```bash
105
+ python app.py
106
+ ```
107
+ If you successfully start the interface, you will see two URLs in the terminal:
108
+ ```bash
109
+ * Running on local URL: http://127.0.0.1:7860
110
+ * Running on public URL: https://xxxxxxxxxxxxxxxx.gradio.live (Do not share this link with others, or they will be able to control your computer.)
111
+ ```
112
+
113
+
114
+ > <u>For convenience</u>, we recommend running one or more of the following commands to set the API keys as environment variables before starting the interface, so you don't need to pass the keys manually on each run. On Windows PowerShell (use the `set` command if on cmd):
115
+ > ```bash
116
+ > $env:ANTHROPIC_API_KEY="sk-xxxxx" (Replace with your own key)
117
+ > $env:QWEN_API_KEY="sk-xxxxx"
118
+ > $env:OPENAI_API_KEY="sk-xxxxx"
119
+ > ```
120
+ > On macOS/Linux, replace `$env:ANTHROPIC_API_KEY` with `export ANTHROPIC_API_KEY` in the above command.
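+ > For example:
+ > ```bash
+ > export ANTHROPIC_API_KEY="sk-xxxxx"  # replace with your own key
+ > export QWEN_API_KEY="sk-xxxxx"
+ > export OPENAI_API_KEY="sk-xxxxx"
+ > ```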
121
+
122
+
123
+ ### 4. Control Your Computer from Any Internet-Connected Device
124
+ - **Computer to be controlled**: the machine on which you installed the software.
125
+ - **Device sending commands**: the device on which you open the web interface.
126
+
127
+ Open the website at http://localhost:7860/ (if you're controlling the computer itself) or https://xxxxxxxxxxxxxxxxx.gradio.live in your mobile browser for remote control.
128
+
129
+ Enter the Anthropic API key (you can obtain it through this [website](https://console.anthropic.com/settings/keys)), then give commands to let the AI perform your tasks.
130
+
131
+ ### ShowUI Advanced Settings
132
+
133
+ We provide a 4-bit quantized ShowUI-2B model for cost-efficient inference (currently **only supports CUDA devices**). To download the 4-bit quantized ShowUI-2B model:
134
+ ```bash
135
+ python install_tools/install_showui-awq-4bit.py
136
+ ```
137
+ Then, enable the quantized setting in the 'ShowUI Advanced Settings' dropdown menu.
138
+
139
+ We also provide a slider to quickly adjust the `max_pixels` parameter of the ShowUI model. It controls the model's visual input size and strongly affects memory usage and inference speed.
140
+
141
+ ## 📊 GUI Agent Model Zoo
142
+
143
+ Now, OOTB supports customizing the GUI Agent via the following models:
144
+
145
+ - **Unified Model**: a single model that serves as both planner and actor, handling high-level planning as well as low-level control.
146
+ - **Planner**: general-purpose LLMs that handle high-level planning and decision-making.
147
+ - **Actor**: vision-language-action models that handle low-level control and action-command generation.
148
+
149
+
150
+ <div align="center">
151
+ <b>GUI Agent Models Supported in OOTB</b>
152
+
153
+ </div>
154
+ <table align="center">
155
+ <tbody>
156
+ <tr align="center" valign="bottom">
157
+ <td>
158
+ <b>[API] Unified Model</b>
159
+ </td>
160
+ <td>
161
+ <b>[API] Planner</b>
162
+ </td>
163
+ <td>
164
+ <b>[Local] Planner</b>
165
+ </td>
166
+ <td>
167
+ <b>[API] Actor</b>
168
+ </td>
169
+ <td>
170
+ <b>[Local] Actor</b>
171
+ </td>
172
+ </tr>
173
+ <tr valign="top">
174
+ <td>
175
+ <ul>
176
+ <li><a href="">Claude 3.5 Sonnet</a></li>
177
+ </ul>
178
+ </td>
179
+ <td>
180
+ <ul>
181
+ <li><a href="">GPT-4o</a></li>
182
+ <li><a href="">Qwen2-VL-Max</a></li>
183
+ <li><a href="">Qwen2-VL-2B(ssh)</a></li>
184
+ <li><a href="">Qwen2-VL-7B(ssh)</a></li>
185
+ <li><a href="">Qwen2.5-VL-7B(ssh)</a></li>
186
+ <li><a href="">Deepseek V3 (soon)</a></li>
187
+ </ul>
188
+ </td>
189
+ <td>
190
+ <ul>
191
+ <li><a href="">Qwen2-VL-2B</a></li>
192
+ <li><a href="">Qwen2-VL-7B</a></li>
193
+ </ul>
194
+ </td>
195
+ <td>
196
+ <ul>
197
+ <li><a href="https://github.com/showlab/ShowUI">ShowUI</a></li>
198
+ <li><a href="https://huggingface.co/bytedance-research/UI-TARS-7B-DPO">UI-TARS-7B/72B-DPO (soon)</a></li>
199
+ </ul>
200
+ </td>
201
+ <td>
202
+ <ul>
203
+ <li><a href="https://github.com/showlab/ShowUI">ShowUI</a></li>
204
+ <li><a href="https://huggingface.co/bytedance-research/UI-TARS-7B-DPO">UI-TARS-7B/72B-DPO</a></li>
205
+ </ul>
206
+ </td>
207
+ </tr>
208
+ </tbody>
209
+ </table>
210
+
211
+ > [API] models call remotely hosted LLMs through an API,
212
+ while [Local] models run inference on your own device with no API costs.
213
+
214
+
215
+
216
+ ## 🖥️ Supported Systems
217
+ - **Windows** (Claude ✅, ShowUI ✅)
218
+ - **macOS** (Claude ✅, ShowUI ✅)
219
+
220
+ ## 👓 OOTB Interface
221
+ <div style="display: flex; align-items: center; gap: 10px;">
222
+ <figure style="text-align: center;">
223
+ <img src="./assets/gradio_interface.png" alt="Desktop Interface" style="width: auto; object-fit: contain;">
224
+ </figure>
225
+ </div>
226
+
227
+
228
+ ## ⚠️ Risks
229
+ - **Potentially Dangerous Operations by the Model**: The models' performance is still limited, and they may generate unintended or potentially harmful outputs. We recommend continuously monitoring the AI's actions.
230
+ - **Cost Control**: Each task may cost a few dollars with Claude 3.5 Computer Use. 💸
231
+
232
+ ## 📅 Roadmap
233
+ - [ ] **Explore available features**
234
+ - [ ] The Claude API seems to be unstable when solving tasks. We are investigating the causes: resolution, types of actions required, OS platforms, or planning mechanisms. Any thoughts or comments are welcome.
235
+ - [ ] **Interface Design**
236
+ - [x] **Support for Gradio** ✨
237
+ - [ ] **Simpler Installation**
238
+ - [ ] **More Features**... 🚀
239
+ - [ ] **Platform**
240
+ - [x] **Windows**
241
+ - [x] **macOS**
242
+ - [x] **Mobile** (Send command)
243
+ - [ ] **Mobile** (Be controlled)
244
+ - [ ] **Support for More MLLMs**
245
+ - [x] **Claude 3.5 Sonnet** 🎵
246
+ - [x] **GPT-4o**
247
+ - [x] **Qwen2-VL**
248
+ - [ ] **Local MLLMs**
249
+ - [ ] ...
250
+ - [ ] **Improved Prompting Strategy**
251
+ - [ ] Optimize prompts for cost-efficiency. 💡
252
+ - [x] **Improved Inference Speed**
253
+ - [x] Support int4 Quantization.
254
+
255
+ ## Join Discussion
256
+ You are welcome to join the discussion and help us continuously improve the user experience of Computer Use OOTB. Reach us via this [**Discord channel**](https://discord.gg/vMMJTSew37) or the WeChat QR code below!
257
+
258
+ <div style="display: flex; flex-direction: row; justify-content: space-around;">
259
+
260
+ <!-- <img src="./assets/wechat_2.jpg" alt="gradio_interface" width="30%"> -->
261
+ <img src="./assets/wechat_3.jpg" alt="gradio_interface" width="30%">
262
+
263
+ </div>
264
+
265
+ <div style="height: 30px;"></div>
266
+
267
+ <hr>
268
+ <a href="https://computer-use-ootb.github.io">
269
+ <img src="./assets/ootb_logo.png" alt="Logo" width="30%" style="display: block; margin: 0 auto; filter: invert(1) brightness(2);">
270
+ </a>
271
+
272
+
273
 
 
app.py ADDED
@@ -0,0 +1,633 @@
1
+ """
2
+ Entrypoint for Gradio, see https://gradio.app/
3
+ """
4
+
5
+ import platform
6
+ import asyncio
7
+ import base64
8
+ import os
9
+ import io
10
+ import json
11
+ from datetime import datetime
12
+ from enum import StrEnum
13
+ from functools import partial
14
+ from pathlib import Path
15
+ from typing import cast, Dict
16
+ from PIL import Image
17
+
18
+ import gradio as gr
19
+ from anthropic import APIResponse
20
+ from anthropic.types import TextBlock
21
+ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock
22
+ from anthropic.types.tool_use_block import ToolUseBlock
23
+
24
+ from screeninfo import get_monitors
25
+ from computer_use_demo.tools.logger import logger, truncate_string
26
+
27
+ logger.info("Starting the gradio app")
28
+
29
+ screens = get_monitors()
30
+ logger.info(f"Found {len(screens)} screens")
31
+
32
+ from computer_use_demo.loop import APIProvider, sampling_loop_sync
33
+
34
+ from computer_use_demo.tools import ToolResult
35
+ from computer_use_demo.tools.computer import get_screen_details
36
+ SCREEN_NAMES, SELECTED_SCREEN_INDEX = get_screen_details()
37
+
38
+ API_KEY_FILE = "./api_keys.json"
39
+
40
+ WARNING_TEXT = "⚠️ Security Alert: Do not provide access to sensitive accounts or data, as malicious web content can hijack the Agent's behavior. Keep monitoring the Agent's actions."
41
+
42
+
43
+ def setup_state(state):
44
+
45
+ if "messages" not in state:
46
+ state["messages"] = []
47
+ # -------------------------------
48
+ if "planner_model" not in state:
49
+ state["planner_model"] = "gpt-4o" # default
50
+ if "actor_model" not in state:
51
+ state["actor_model"] = "ShowUI" # default
52
+
53
+ if "planner_provider" not in state:
54
+ state["planner_provider"] = "openai" # default
55
+ if "actor_provider" not in state:
56
+ state["actor_provider"] = "local" # default
57
+
58
+ # Fetch API keys from environment variables
59
+ if "openai_api_key" not in state:
60
+ state["openai_api_key"] = os.getenv("OPENAI_API_KEY", "")
61
+ if "anthropic_api_key" not in state:
62
+ state["anthropic_api_key"] = os.getenv("ANTHROPIC_API_KEY", "")
63
+ if "qwen_api_key" not in state:
64
+ state["qwen_api_key"] = os.getenv("QWEN_API_KEY", "")
65
+ if "ui_tars_url" not in state:
66
+ state["ui_tars_url"] = ""
67
+
68
+ # Set the initial api_key based on the provider
69
+ if "planner_api_key" not in state:
70
+ if state["planner_provider"] == "openai":
71
+ state["planner_api_key"] = state["openai_api_key"]
72
+ elif state["planner_provider"] == "anthropic":
73
+ state["planner_api_key"] = state["anthropic_api_key"]
74
+ elif state["planner_provider"] == "qwen":
75
+ state["planner_api_key"] = state["qwen_api_key"]
76
+ else:
77
+ state["planner_api_key"] = ""
78
+
79
+ logger.info(f"loaded initial api_key for {state['planner_provider']}: {state['planner_api_key']}")
80
+
81
+ if not state["planner_api_key"]:
82
+ logger.warning("Planner API key not found. Please set it in the environment or paste in textbox.")
83
+
84
+
85
+ if "selected_screen" not in state:
86
+ state['selected_screen'] = SELECTED_SCREEN_INDEX if SCREEN_NAMES else 0
87
+
88
+ if "auth_validated" not in state:
89
+ state["auth_validated"] = False
90
+ if "responses" not in state:
91
+ state["responses"] = {}
92
+ if "tools" not in state:
93
+ state["tools"] = {}
94
+ if "only_n_most_recent_images" not in state:
95
+ state["only_n_most_recent_images"] = 10 # 10
96
+ if "custom_system_prompt" not in state:
97
+ state["custom_system_prompt"] = ""
98
+ # remove if want to use default system prompt
99
+ device_os_name = "Windows" if platform.system() == "Windows" else "Mac" if platform.system() == "Darwin" else "Linux"
100
+ state["custom_system_prompt"] += f"\n\nNOTE: you are operating a {device_os_name} machine"
101
+ if "hide_images" not in state:
102
+ state["hide_images"] = False
103
+ if 'chatbot_messages' not in state:
104
+ state['chatbot_messages'] = []
105
+
106
+ if "showui_config" not in state:
107
+ state["showui_config"] = "Default"
108
+ if "max_pixels" not in state:
109
+ state["max_pixels"] = 1344
110
+ if "awq_4bit" not in state:
111
+ state["awq_4bit"] = False
112
+
113
+
114
+ async def main(state):
115
+ """Render loop for Gradio"""
116
+ setup_state(state)
117
+ return "Setup completed"
118
+
119
+
120
+ def validate_auth(provider: APIProvider, api_key: str | None):
121
+ if provider == APIProvider.ANTHROPIC:
122
+ if not api_key:
123
+ return "Enter your Anthropic API key to continue."
124
+ if provider == APIProvider.BEDROCK:
125
+ import boto3
126
+
127
+ if not boto3.Session().get_credentials():
128
+ return "You must have AWS credentials set up to use the Bedrock API."
129
+ if provider == APIProvider.VERTEX:
130
+ import google.auth
131
+ from google.auth.exceptions import DefaultCredentialsError
132
+
133
+ if not os.environ.get("CLOUD_ML_REGION"):
134
+ return "Set the CLOUD_ML_REGION environment variable to use the Vertex API."
135
+ try:
136
+ google.auth.default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
137
+ except DefaultCredentialsError:
138
+ return "Your google cloud credentials are not set up correctly."
139
+
140
+
141
+ def _api_response_callback(response: APIResponse[BetaMessage], response_state: dict):
142
+ response_id = datetime.now().isoformat()
143
+ response_state[response_id] = response
144
+
145
+
146
+ def _tool_output_callback(tool_output: ToolResult, tool_id: str, tool_state: dict):
147
+ tool_state[tool_id] = tool_output
148
+
149
+
150
+ def chatbot_output_callback(message, chatbot_state, hide_images=False, sender="bot"):
151
+
152
+ def _render_message(message: str | BetaTextBlock | BetaToolUseBlock | ToolResult, hide_images=False):
153
+
154
+ logger.info(f"_render_message: {str(message)[:100]}")
155
+
156
+ if isinstance(message, str):
157
+ return message
158
+
159
+ is_tool_result = not isinstance(message, str) and (
160
+ isinstance(message, ToolResult)
161
+ or message.__class__.__name__ == "ToolResult"
162
+ or message.__class__.__name__ == "CLIResult"
163
+ )
164
+ if not message or (
165
+ is_tool_result
166
+ and hide_images
167
+ and not hasattr(message, "error")
168
+ and not hasattr(message, "output")
169
+ ): # return None if hide_images is True
170
+ return
171
+ # render tool result
172
+ if is_tool_result:
173
+ message = cast(ToolResult, message)
174
+ if message.output:
175
+ return message.output
176
+ if message.error:
177
+ return f"Error: {message.error}"
178
+ if message.base64_image and not hide_images:
179
+ # somehow can't display via gr.Image
180
+ # image_data = base64.b64decode(message.base64_image)
181
+ # return gr.Image(value=Image.open(io.BytesIO(image_data)))
182
+ return f'<img src="data:image/png;base64,{message.base64_image}">'
183
+
184
+ elif isinstance(message, BetaTextBlock) or isinstance(message, TextBlock):
185
+ return message.text
186
+ elif isinstance(message, BetaToolUseBlock) or isinstance(message, ToolUseBlock):
187
+ return f"Tool Use: {message.name}\nInput: {message.input}"
188
+ else:
189
+ return message
190
+
191
+
192
+ # processing Anthropic messages
193
+ message = _render_message(message, hide_images)
194
+
195
+ if sender == "bot":
196
+ chatbot_state.append((None, message))
197
+ else:
198
+ chatbot_state.append((message, None))
199
+
200
+ # Create a concise version of the chatbot state for logging
201
+ concise_state = [(truncate_string(user_msg), truncate_string(bot_msg)) for user_msg, bot_msg in chatbot_state]
202
+ logger.info(f"chatbot_output_callback chatbot_state: {concise_state} (truncated)")
203
+
204
+
205
+ def process_input(user_input, state):
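+ # Generator consumed by Gradio: first yields the user's message, then streams the updated chatbot history as the agent loop progresses.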
206
+
207
+ setup_state(state)
208
+
209
+ # Append the user message to state["messages"]
210
+ state["messages"].append(
211
+ {
212
+ "role": "user",
213
+ "content": [TextBlock(type="text", text=user_input)],
214
+ }
215
+ )
216
+
217
+ # Append the user's message to chatbot_messages with None for the assistant's reply
218
+ state['chatbot_messages'].append((user_input, None))
219
+ yield state['chatbot_messages'] # Yield to update the chatbot UI with the user's message
220
+
221
+ # Run sampling_loop_sync with the chatbot_output_callback
222
+ for loop_msg in sampling_loop_sync(
223
+ system_prompt_suffix=state["custom_system_prompt"],
224
+ planner_model=state["planner_model"],
225
+ planner_provider=state["planner_provider"],
226
+ actor_model=state["actor_model"],
227
+ actor_provider=state["actor_provider"],
228
+ messages=state["messages"],
229
+ output_callback=partial(chatbot_output_callback, chatbot_state=state['chatbot_messages'], hide_images=state["hide_images"]),
230
+ tool_output_callback=partial(_tool_output_callback, tool_state=state["tools"]),
231
+ api_response_callback=partial(_api_response_callback, response_state=state["responses"]),
232
+ api_key=state["planner_api_key"],
233
+ only_n_most_recent_images=state["only_n_most_recent_images"],
234
+ selected_screen=state['selected_screen'],
235
+ showui_max_pixels=state['max_pixels'],
236
+ showui_awq_4bit=state['awq_4bit']
237
+ ):
238
+ if loop_msg is None:
239
+ yield state['chatbot_messages']
240
+ logger.info("End of task. Close the loop.")
241
+ break
242
+
243
+
244
+ yield state['chatbot_messages'] # Yield the updated chatbot_messages to update the chatbot UI
245
+
246
+
247
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
248
+
249
+ state = gr.State({}) # Use Gradio's state management
250
+ setup_state(state.value) # Initialize the state
251
+
252
+ # Retrieve screen details
253
+ gr.Markdown("# Computer Use OOTB")
254
+
255
+ if not os.getenv("HIDE_WARNING", False):
256
+ gr.Markdown(WARNING_TEXT)
257
+
258
+ with gr.Accordion("Settings", open=True):
259
+ with gr.Row():
260
+ with gr.Column():
261
+ # --------------------------
262
+ # Planner
263
+ planner_model = gr.Dropdown(
264
+ label="Planner Model",
265
+ choices=["gpt-4o",
266
+ "gpt-4o-mini",
267
+ "qwen2-vl-max",
268
+ "qwen2-vl-2b (local)",
269
+ "qwen2-vl-7b (local)",
270
+ "qwen2-vl-2b (ssh)",
271
+ "qwen2-vl-7b (ssh)",
272
+ "qwen2.5-vl-7b (ssh)",
273
+ "claude-3-5-sonnet-20241022"],
274
+ value="gpt-4o",
275
+ interactive=True,
276
+ )
277
+ with gr.Column():
278
+ planner_api_provider = gr.Dropdown(
279
+ label="API Provider",
280
+ choices=[option.value for option in APIProvider],
281
+ value="openai",
282
+ interactive=False,
283
+ )
284
+ with gr.Column():
285
+ planner_api_key = gr.Textbox(
286
+ label="Planner API Key",
287
+ type="password",
288
+ value=state.value.get("planner_api_key", ""),
289
+ placeholder="Paste your planner model API key",
290
+ interactive=True,
291
+ )
292
+
293
+ with gr.Column():
294
+ actor_model = gr.Dropdown(
295
+ label="Actor Model",
296
+ choices=["ShowUI", "UI-TARS"],
297
+ value="ShowUI",
298
+ interactive=True,
299
+ )
300
+
301
+ with gr.Column():
302
+ custom_prompt = gr.Textbox(
303
+ label="System Prompt Suffix",
304
+ value="",
305
+ interactive=True,
306
+ )
307
+ with gr.Column():
308
+ screen_options, primary_index = get_screen_details()
309
+ SCREEN_NAMES = screen_options
310
+ SELECTED_SCREEN_INDEX = primary_index
311
+ screen_selector = gr.Dropdown(
312
+ label="Select Screen",
313
+ choices=screen_options,
314
+ value=screen_options[primary_index] if screen_options else None,
315
+ interactive=True,
316
+ )
317
+ with gr.Column():
318
+ only_n_images = gr.Slider(
319
+ label="N most recent screenshots",
320
+ minimum=0,
321
+ maximum=10,
322
+ step=1,
323
+ value=2,
324
+ interactive=True,
325
+ )
326
+
327
+ with gr.Accordion("ShowUI Advanced Settings", open=False):
328
+
329
+ gr.Markdown("""
330
+ **Note:** Adjust these settings to fine-tune the resource (**memory** and **infer time**) and performance trade-offs of ShowUI. \\
331
+ Quantization model requires additional download. Please refer to [Computer Use OOTB - #ShowUI Advanced Settings guide](https://github.com/showlab/computer_use_ootb?tab=readme-ov-file#showui-advanced-settings) for preparation for this feature.
332
+ """)
333
+
334
+ # New configuration for ShowUI
335
+ with gr.Row():
336
+ with gr.Column():
337
+ showui_config = gr.Dropdown(
338
+ label="ShowUI Preset Configuration",
339
+ choices=["Default (Maximum)", "Medium", "Minimal", "Custom"],
340
+ value="Default (Maximum)",
341
+ interactive=True,
342
+ )
343
+ with gr.Column():
344
+ max_pixels = gr.Slider(
345
+ label="Max Visual Tokens",
346
+ minimum=720,
347
+ maximum=1344,
348
+ step=16,
349
+ value=1344,
350
+ interactive=False,
351
+ )
352
+ with gr.Column():
353
+ awq_4bit = gr.Checkbox(
354
+ label="Enable AWQ-4bit Model",
355
+ value=False,
356
+ interactive=False
357
+ )
358
+
359
+ # Define the merged dictionary with task mappings
360
+ merged_dict = json.load(open("assets/examples/ootb_examples.json", "r"))
361
+
362
+ def update_only_n_images(only_n_images_value, state):
363
+ state["only_n_most_recent_images"] = only_n_images_value
364
+
365
+ # Callback to update the second dropdown based on the first selection
366
+ def update_second_menu(selected_category):
367
+ return gr.update(choices=list(merged_dict.get(selected_category, {}).keys()))
368
+
369
+ # Callback to update the third dropdown based on the second selection
370
+ def update_third_menu(selected_category, selected_option):
371
+ return gr.update(choices=list(merged_dict.get(selected_category, {}).get(selected_option, {}).keys()))
372
+
373
+ # Callback to update the textbox based on the third selection
374
+ def update_textbox(selected_category, selected_option, selected_task):
375
+ task_data = merged_dict.get(selected_category, {}).get(selected_option, {}).get(selected_task, {})
376
+ prompt = task_data.get("prompt", "")
377
+ preview_image = task_data.get("initial_state", "")
378
+ task_hint = "Task Hint: " + task_data.get("hint", "")
379
+ return prompt, preview_image, task_hint
380
+
381
+ # Function to update the global variable when the dropdown changes
382
+ def update_selected_screen(selected_screen_name, state):
383
+ global SCREEN_NAMES
384
+ global SELECTED_SCREEN_INDEX
385
+ SELECTED_SCREEN_INDEX = SCREEN_NAMES.index(selected_screen_name)
386
+ logger.info(f"Selected screen updated to: {SELECTED_SCREEN_INDEX}")
387
+ state['selected_screen'] = SELECTED_SCREEN_INDEX
388
+
389
+
390
+ def update_planner_model(model_selection, state):
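+ # Reconfigure the API provider dropdown, the API-key textbox, and the actor-model dropdown to match the selected planner model.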
391
+ state["model"] = model_selection
392
+ # Update planner_model
393
+ state["planner_model"] = model_selection
394
+ logger.info(f"Model updated to: {state['planner_model']}")
395
+
396
+ if model_selection == "qwen2-vl-max":
397
+ provider_choices = ["qwen"]
398
+ provider_value = "qwen"
399
+ provider_interactive = False
400
+ api_key_interactive = True
401
+ api_key_placeholder = "qwen API key"
402
+ actor_model_choices = ["ShowUI", "UI-TARS"]
403
+ actor_model_value = "ShowUI"
404
+ actor_model_interactive = True
405
+ api_key_type = "password" # Display API key in password form
406
+
407
+ elif model_selection == "qwen2-vl-2b (local)" or model_selection == "qwen2-vl-7b (local)":
408
+ # Set provider to "openai", make it unchangeable
409
+ provider_choices = ["local"]
410
+ provider_value = "local"
411
+ provider_interactive = False
412
+ api_key_interactive = False
413
+ api_key_placeholder = "not required"
414
+ actor_model_choices = ["ShowUI", "UI-TARS"]
415
+ actor_model_value = "ShowUI"
416
+ actor_model_interactive = True
417
+ api_key_type = "password" # Maintain consistency
418
+
419
+ elif "ssh" in model_selection:
420
+ provider_choices = ["ssh"]
421
+ provider_value = "ssh"
422
+ provider_interactive = False
423
+ api_key_interactive = True
424
+ api_key_placeholder = "ssh host and port (e.g. localhost:8000)"
425
+ actor_model_choices = ["ShowUI", "UI-TARS"]
426
+ actor_model_value = "ShowUI"
427
+ actor_model_interactive = True
428
+ api_key_type = "text" # Display SSH connection info in plain text
429
+ # If SSH connection info already exists, keep it
430
+ if "planner_api_key" in state and state["planner_api_key"]:
431
+ state["api_key"] = state["planner_api_key"]
432
+ else:
433
+ state["api_key"] = ""
434
+
435
+ elif model_selection == "gpt-4o" or model_selection == "gpt-4o-mini":
436
+ # Set provider to "openai", make it unchangeable
437
+ provider_choices = ["openai"]
438
+ provider_value = "openai"
439
+ provider_interactive = False
440
+ api_key_interactive = True
441
+ api_key_type = "password" # Display API key in password form
442
+
443
+ api_key_placeholder = "openai API key"
444
+ actor_model_choices = ["ShowUI", "UI-TARS"]
445
+ actor_model_value = "ShowUI"
446
+ actor_model_interactive = True
447
+
448
+ elif model_selection == "claude-3-5-sonnet-20241022":
449
+ # Provider can be any of the current choices except 'openai'
450
+ provider_choices = [option.value for option in APIProvider if option.value != "openai"]
451
+ provider_value = "anthropic" # Set default to 'anthropic'
452
+ provider_interactive = True
453
+ api_key_interactive = True
454
+ api_key_placeholder = "claude API key"
455
+ actor_model_choices = ["claude-3-5-sonnet-20241022"]
456
+ actor_model_value = "claude-3-5-sonnet-20241022"
457
+ actor_model_interactive = False
458
+ api_key_type = "password" # Display API key in password form
459
+
460
+ else:
461
+ raise ValueError(f"Model {model_selection} not supported")
462
+
463
+ # Update the provider in state
464
+ state["planner_api_provider"] = provider_value
465
+
466
+ # Update api_key in state based on the provider
467
+ if provider_value == "openai":
468
+ state["api_key"] = state.get("openai_api_key", "")
469
+ elif provider_value == "anthropic":
470
+ state["api_key"] = state.get("anthropic_api_key", "")
471
+ elif provider_value == "qwen":
472
+ state["api_key"] = state.get("qwen_api_key", "")
473
+ elif provider_value == "local":
474
+ state["api_key"] = ""
475
+ # The SSH case has already been handled above, so it does not need to be handled again here.
476
+
477
+ provider_update = gr.update(
478
+ choices=provider_choices,
479
+ value=provider_value,
480
+ interactive=provider_interactive
481
+ )
482
+
483
+ # Update the API Key textbox
484
+ api_key_update = gr.update(
485
+ placeholder=api_key_placeholder,
486
+ value=state["api_key"],
487
+ interactive=api_key_interactive,
488
+ type=api_key_type # also update the textbox type (password vs. plain text)
489
+ )
490
+
491
+ actor_model_update = gr.update(
492
+ choices=actor_model_choices,
493
+ value=actor_model_value,
494
+ interactive=actor_model_interactive
495
+ )
496
+
497
+ logger.info(f"Updated state: model={state['planner_model']}, provider={state['planner_api_provider']}, api_key={state['api_key']}")
498
+ return provider_update, api_key_update, actor_model_update
499
+
500
+ def update_actor_model(actor_model_selection, state):
501
+ state["actor_model"] = actor_model_selection
502
+ logger.info(f"Actor model updated to: {state['actor_model']}")
503
+
504
+ def update_api_key_placeholder(provider_value, model_selection):
505
+ if model_selection == "claude-3-5-sonnet-20241022":
506
+
507
+ if provider_value == "anthropic":
508
+ return gr.update(placeholder="anthropic API key")
509
+ elif provider_value == "bedrock":
510
+ return gr.update(placeholder="bedrock API key")
511
+ elif provider_value == "vertex":
512
+ return gr.update(placeholder="vertex API key")
513
+ else:
514
+ return gr.update(placeholder="")
515
+ elif model_selection == "gpt-4o + ShowUI":
516
+ return gr.update(placeholder="openai API key")
517
+ else:
518
+ return gr.update(placeholder="")
519
+
520
+ def update_system_prompt_suffix(system_prompt_suffix, state):
521
+ state["custom_system_prompt"] = system_prompt_suffix
522
+
523
+ # When showui_config changes, we set the max_pixels and awq_4bit accordingly.
524
+ def handle_showui_config_change(showui_config_val, state):
525
+ if showui_config_val == "Default (Maximum)":
526
+ state["max_pixels"] = 1344
527
+ state["awq_4bit"] = False
528
+ return (
529
+ gr.update(value=1344, interactive=False),
530
+ gr.update(value=False, interactive=False)
531
+ )
532
+ elif showui_config_val == "Medium":
533
+ state["max_pixels"] = 1024
534
+ state["awq_4bit"] = False
535
+ return (
536
+ gr.update(value=1024, interactive=False),
537
+ gr.update(value=False, interactive=False)
538
+ )
539
+ elif showui_config_val == "Minimal":
540
+ state["max_pixels"] = 1024
541
+ state["awq_4bit"] = True
542
+ return (
543
+ gr.update(value=1024, interactive=False),
544
+ gr.update(value=True, interactive=False)
545
+ )
546
+ elif showui_config_val == "Custom":
547
+ # Do not overwrite the current user values, just make them interactive
548
+ return (
549
+ gr.update(interactive=True),
550
+ gr.update(interactive=True)
551
+ )
552
+
553
+ def update_api_key(api_key_value, state):
554
+ """Handle API key updates"""
555
+ state["planner_api_key"] = api_key_value
556
+ if state["planner_provider"] == "ssh":
557
+ state["api_key"] = api_key_value
558
+ logger.info(f"API key updated: provider={state['planner_provider']}, api_key={state['api_key']}")
559
+
560
+ with gr.Accordion("Quick Start Prompt", open=False): # open=False 表示默认收
561
+ # Initialize Gradio interface with the dropdowns
562
+ with gr.Row():
563
+ # Set initial values
564
+ initial_category = "Game Play"
565
+ initial_second_options = list(merged_dict[initial_category].keys())
566
+ initial_third_options = list(merged_dict[initial_category][initial_second_options[0]].keys())
567
+ initial_text_value = merged_dict[initial_category][initial_second_options[0]][initial_third_options[0]]
568
+
569
+ with gr.Column(scale=2):
570
+ # First dropdown for Task Category
571
+ first_menu = gr.Dropdown(
572
+ choices=list(merged_dict.keys()), label="Task Category", interactive=True, value=initial_category
573
+ )
574
+
575
+ # Second dropdown for Software
576
+ second_menu = gr.Dropdown(
577
+ choices=initial_second_options, label="Software", interactive=True, value=initial_second_options[0]
578
+ )
579
+
580
+ # Third dropdown for Task
581
+ third_menu = gr.Dropdown(
582
+ choices=initial_third_options, label="Task", interactive=True, value=initial_third_options[0]
583
+ # choices=["Please select a task"]+initial_third_options, label="Task", interactive=True, value="Please select a task"
584
+ )
585
+
586
+ with gr.Column(scale=1):
587
+ initial_image_value = "./assets/examples/init_states/honkai_star_rail_showui.png" # default image path
588
+ image_preview = gr.Image(value=initial_image_value, label="Reference Initial State", height=260-(318.75-280))
589
+ hintbox = gr.Markdown("Task Hint: Selected options will appear here.")
590
+
591
+ # Textbox for displaying the mapped value
592
+ # textbox = gr.Textbox(value=initial_text_value, label="Action")
593
+
594
+ # api_key.change(fn=lambda key: save_to_storage(API_KEY_FILE, key), inputs=api_key)
595
+
596
+ with gr.Row():
597
+ # submit_button = gr.Button("Submit") # Add submit button
598
+ with gr.Column(scale=8):
599
+ chat_input = gr.Textbox(show_label=False, placeholder="Type a message to send to Computer Use OOTB...", container=False)
600
+ with gr.Column(scale=1, min_width=50):
601
+ submit_button = gr.Button(value="Send", variant="primary")
602
+
603
+ chatbot = gr.Chatbot(label="Chatbot History", type="tuples", autoscroll=True, height=580)
604
+
605
+ planner_model.change(fn=update_planner_model, inputs=[planner_model, state], outputs=[planner_api_provider, planner_api_key, actor_model])
606
+ planner_api_provider.change(fn=update_api_key_placeholder, inputs=[planner_api_provider, planner_model], outputs=planner_api_key)
607
+ actor_model.change(fn=update_actor_model, inputs=[actor_model, state], outputs=None)
608
+
609
+ screen_selector.change(fn=update_selected_screen, inputs=[screen_selector, state], outputs=None)
610
+ only_n_images.change(fn=update_only_n_images, inputs=[only_n_images, state], outputs=None)
611
+
612
+ # When showui_config changes, we update max_pixels and awq_4bit automatically.
613
+ showui_config.change(fn=handle_showui_config_change,
614
+ inputs=[showui_config, state],
615
+ outputs=[max_pixels, awq_4bit])
616
+
617
+ # Link callbacks to update dropdowns based on selections
618
+ first_menu.change(fn=update_second_menu, inputs=first_menu, outputs=second_menu)
619
+ second_menu.change(fn=update_third_menu, inputs=[first_menu, second_menu], outputs=third_menu)
620
+ third_menu.change(fn=update_textbox, inputs=[first_menu, second_menu, third_menu], outputs=[chat_input, image_preview, hintbox])
621
+
622
+ # chat_input.submit(process_input, [chat_input, state], chatbot)
623
+ submit_button.click(process_input, [chat_input, state], chatbot)
624
+
625
+ planner_api_key.change(
626
+ fn=update_api_key,
627
+ inputs=[planner_api_key, state],
628
+ outputs=None
629
+ )
630
+
631
+ demo.launch(share=True,
632
+ allowed_paths=["./"],
633
+ server_port=7888) # TODO: allowed_paths
assets/Teaser.gif ADDED

Git LFS Details

  • SHA256: a91db8629e529ccfb87e08344b3c921d50eba31659a65b8087e3d87e7fb1868f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.03 MB
assets/examples/init_states/amazon.png ADDED

Git LFS Details

  • SHA256: 00affe0186c7eb26ffbb750408bf1211a4c94a4d4f5a976cf33658eea6546c6e
  • Pointer size: 132 Bytes
  • Size of remote file: 4.08 MB
assets/examples/init_states/booking.png ADDED

Git LFS Details

  • SHA256: b3fd069d1d27f4609611b2e68822d12f2d97f6baf992e3f6ba4299597c42c8c4
  • Pointer size: 131 Bytes
  • Size of remote file: 740 kB
assets/examples/init_states/honkai_star_rail.png ADDED

Git LFS Details

  • SHA256: 8c9b874506d259d173fd717f21e3d07e0b18a96fcaf81604131540c8543875d7
  • Pointer size: 132 Bytes
  • Size of remote file: 3.69 MB
assets/examples/init_states/honkai_star_rail_showui.png ADDED

Git LFS Details

  • SHA256: 356e1292dddc18262f97cf7ef30841b1b5ee1504554694948f2cc7992d96d810
  • Pointer size: 132 Bytes
  • Size of remote file: 5.47 MB
assets/examples/init_states/ign.png ADDED

Git LFS Details

  • SHA256: 73115cecd7aaf5b47966330a62c68e0042e185a02176740ebbaa0609737c28d2
  • Pointer size: 132 Bytes
  • Size of remote file: 5.79 MB
assets/examples/init_states/powerpoint.png ADDED

Git LFS Details

  • SHA256: 0bbe880b5687014ca1ea12e15b5d64877532108d7a9e6512c61ddf5524651272
  • Pointer size: 131 Bytes
  • Size of remote file: 419 kB
assets/examples/init_states/powerpoint_homepage.png ADDED

Git LFS Details

  • SHA256: 8992a580e11bc47c0193067ab7972dcb42c0c72ca2b6cff8583c3b05541589d0
  • Pointer size: 131 Bytes
  • Size of remote file: 653 kB
assets/examples/ootb_examples.json ADDED
@@ -0,0 +1,73 @@
1
+ {
2
+ "Web Navigation": {
3
+ "Shopping": {
4
+ "Search Gift Card": {
5
+ "hint": "Search for 'You are Amazing' congrats gift card",
6
+ "prompt": "Search for 'You are Amazing' congrats gift card",
7
+ "initial_state": ".\\assets\\examples\\init_states\\amazon.png"
8
+ },
9
+ "Add Headphones": {
10
+ "hint": "Add a set of wireless headphones to your cart",
11
+ "prompt": "Add a set of wireless headphones to your cart",
12
+ "initial_state": ".\\assets\\examples\\init_states\\amazon.png"
13
+ }
14
+ },
15
+ "Accommodation": {
16
+ "Find Private Room": {
17
+ "hint": "Find a private room in New York",
18
+ "prompt": "Find a private room in New York",
19
+ "initial_state": ".\\assets\\examples\\init_states\\booking.png"
20
+ }
21
+ },
22
+ "Gaming": {
23
+ "Walk-through Guide": {
24
+ "hint": "Find a walk-through guide for the game 'Black Myth: Wukong'",
25
+ "prompt": "Find a walk-through guide for the game 'Black Myth: Wukong'",
26
+ "initial_state": ".\\assets\\examples\\init_states\\ign.png"
27
+ }
28
+ }
29
+ },
30
+ "Productivity": {
31
+ "Presentations": {
32
+ "Create Presentation": {
33
+ "hint": "Create a new presentation and set the title to 'Hail Computer Use OOTB!'",
34
+ "prompt": "Create a new presentation and edit the title to 'Hail Computer Use OOTB!'",
35
+ "initial_state": ".\\assets\\examples\\init_states\\powerpoint_homepage.png"
36
+ },
37
+ "Duplicate First Slide": {
38
+ "hint": "Duplicate the first slide in PowerPoint",
39
+ "prompt": "Duplicate the first slide in PowerPoint",
40
+ "initial_state": ".\\assets\\examples\\init_states\\powerpoint.png"
41
+ },
42
+ "Insert Picture": {
43
+ "hint": "Insert a picture from my device into the current slide, selecting the first image in the photo browser",
44
+ "prompt": "Insert a picture from my device into the current slide",
45
+ "initial_state": ".\\assets\\examples\\init_states\\powerpoint.png"
46
+ },
47
+ "Apply Morph Transition": {
48
+ "hint": "Apply the Morph transition to all slides",
49
+ "prompt": "Apply the Morph transition to all slides",
50
+ "initial_state": ".\\assets\\examples\\init_states\\powerpoint.png"
51
+ }
52
+ }
53
+ },
54
+ "Game Play": {
55
+ "Honkai: Star Rail": {
56
+ "Daily Task (ShowUI)": {
57
+ "hint": "Complete the daily task",
58
+ "prompt": "1. Escape on the keyboard to open the menu. 2. Click 'Interastral Guide'. 3. Then click 'calyx golden for exp' entry. 4. Then click on the 'Teleport of Buds of MEMORIES'. 5. Press the 'bottom plus + button, the one below'. 6. Then click Challenge 7. Then click Start Challenge. 8. Then click on exit when the battle is completed.",
59
+ "initial_state": ".\\assets\\examples\\init_states\\honkai_star_rail_showui.png"
60
+ },
61
+ "Daily Task (Claude 3.5 Computer Use)": {
62
+ "hint": "Complete the daily task",
63
+ "prompt": "You are currently playing Honkai: Star Rail, your objective is to finish a daily game task for me. Press escape on the keyboard to open the menu, then click interastral guide, then click 'calyx golden for exp' entry on the left side of the popped up game window. Only then click on the teleport button on the same line of the first entry named 'buds of MEMORIES' (you need to carefully check the name), then click 'plus +' button 5 times to increase attempts to 6, then click challenge, then click start challenge. Then click the auto-battle button at the right-up corner - carefully count from the right to the left, it should be the second icon, it is near the 'pause' icon, it looks like an 'infinite' symbol. Then click on exit when the battle is completed.",
64
+ "initial_state": ".\\assets\\examples\\init_states\\honkai_star_rail.png"
65
+ },
66
+ "Warp": {
67
+ "hint": "Perform a warp (gacha pull)",
68
+ "prompt": "You are currently playing Honkai: Star Rail, your objective is to perform a 10-warp pull for me. Press escape on the keyboard to open the menu, then click warp. It should open the warp page, and the first entry on the left side would be 'Words of Yore', this would be the destination pool. Then click on 'warp x10' to perform a 10-warp pull, then click at the blank space at the right-up corner to reveal the arrow at the right-up corner, then click on the arrow to skip the animation. Always click on the arrow to continue skipping the animation if there is an arrow at the right-up corner. Only when all animations are skipped by clicking on the arrows, the pull summary page will appear and there would be a cross there, click on the cross to finish the pull. Good luck!",
69
+ "initial_state": ".\\assets\\examples\\init_states\\honkai_star_rail.png"
70
+ }
71
+ }
72
+ }
73
+ }
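Each leaf of this config carries a hint, a prompt, and a reference initial-state screenshot. A minimal sketch of how one entry could be resolved into the values the Gradio UI expects (the helper name is illustrative, not part of the repo):
import json

def lookup_task_sketch(category, subcategory, task_name,
                       path="assets/examples/ootb_examples.json"):
    # Return the chat prompt, the reference screenshot path, and the hint text for one task.
    with open(path, "r", encoding="utf-8") as f:
        examples = json.load(f)
    task = examples[category][subcategory][task_name]
    return task["prompt"], task["initial_state"], task["hint"]

# e.g. lookup_task_sketch("Web Navigation", "Shopping", "Search Gift Card")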
assets/gradio_interface.png ADDED

Git LFS Details

  • SHA256: f4eeb66f32c8547e6e6f69dc45ce1f61cf256f7e880ba2a785d49dca67f5fe32
  • Pointer size: 132 Bytes
  • Size of remote file: 1.63 MB
assets/ootb_icon.png ADDED
assets/ootb_logo.png ADDED
assets/wechat_3.jpg ADDED

Git LFS Details

  • SHA256: ac687f4677f7544e093eb13bff39fcd829206165be217b43abcf9830858a1ec7
  • Pointer size: 131 Bytes
  • Size of remote file: 140 kB
computer_use_demo/__init__.py ADDED
File without changes
computer_use_demo/executor/anthropic_executor.py ADDED
@@ -0,0 +1,135 @@
1
+ import asyncio
2
+ from typing import Any, Dict, cast
3
+ from collections.abc import Callable
4
+ from anthropic.types.beta import (
5
+ BetaContentBlock,
6
+ BetaContentBlockParam,
7
+ BetaImageBlockParam,
8
+ BetaMessage,
9
+ BetaMessageParam,
10
+ BetaTextBlockParam,
11
+ BetaToolResultBlockParam,
12
+ )
13
+ from anthropic.types import TextBlock
14
+ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock
15
+ from ..tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult
16
+
17
+
18
+ class AnthropicExecutor:
19
+ def __init__(
20
+ self,
21
+ output_callback: Callable[[BetaContentBlockParam], None],
22
+ tool_output_callback: Callable[[Any, str], None],
23
+ selected_screen: int = 0
24
+ ):
25
+ self.tool_collection = ToolCollection(
26
+ ComputerTool(selected_screen=selected_screen),
27
+ BashTool(),
28
+ EditTool(),
29
+ )
30
+ self.output_callback = output_callback
31
+ self.tool_output_callback = tool_output_callback
32
+
33
+ def __call__(self, response: BetaMessage, messages: list[BetaMessageParam]):
34
+ new_message = {
35
+ "role": "assistant",
36
+ "content": cast(list[BetaContentBlockParam], response.content),
37
+ }
38
+ if new_message not in messages:
39
+ messages.append(new_message)
40
+ else:
41
+ print("new_message already in messages, there are duplicates.")
42
+
43
+ tool_result_content: list[BetaToolResultBlockParam] = []
44
+ for content_block in cast(list[BetaContentBlock], response.content):
45
+
46
+ self.output_callback(content_block, sender="bot")
47
+ # Execute the tool
48
+ if content_block.type == "tool_use":
49
+ # Run the asynchronous tool execution in a synchronous context
50
+ result = asyncio.run(self.tool_collection.run(
51
+ name=content_block.name,
52
+ tool_input=cast(dict[str, Any], content_block.input),
53
+ ))
54
+
55
+ self.output_callback(result, sender="bot")
56
+
57
+ tool_result_content.append(
58
+ _make_api_tool_result(result, content_block.id)
59
+ )
60
+ self.tool_output_callback(result, content_block.id)
61
+
62
+ # Craft messages based on the content_block
63
+ # Note: to display the messages in Gradio, organize them as (user message, bot message) pairs
64
+
65
+ display_messages = _message_display_callback(messages)
66
+ # display_messages = []
67
+
68
+ # Send the messages to the gradio
69
+ for user_msg, bot_msg in display_messages:
70
+ yield [user_msg, bot_msg], tool_result_content
71
+
72
+ if not tool_result_content:
73
+ return messages
74
+
75
+ return tool_result_content
76
+
77
+ def _message_display_callback(messages):
78
+ display_messages = []
79
+ for msg in messages:
80
+ try:
81
+ if isinstance(msg["content"][0], TextBlock):
82
+ display_messages.append((msg["content"][0].text, None)) # User message
83
+ elif isinstance(msg["content"][0], BetaTextBlock):
84
+ display_messages.append((None, msg["content"][0].text)) # Bot message
85
+ elif isinstance(msg["content"][0], BetaToolUseBlock):
86
+ display_messages.append((None, f"Tool Use: {msg['content'][0].name}\nInput: {msg['content'][0].input}")) # Bot message
87
+ elif isinstance(msg["content"][0], Dict) and msg["content"][0]["content"][-1]["type"] == "image":
88
+ display_messages.append((None, f'<img src="data:image/png;base64,{msg["content"][0]["content"][-1]["source"]["data"]}">')) # Bot message
89
+ else:
90
+ print(msg["content"][0])
91
+ except Exception as e:
92
+ print("error", e)
93
+ pass
94
+ return display_messages
95
+
96
+ def _make_api_tool_result(
97
+ result: ToolResult, tool_use_id: str
98
+ ) -> BetaToolResultBlockParam:
99
+ """Convert an agent ToolResult to an API ToolResultBlockParam."""
100
+ tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = []
101
+ is_error = False
102
+ if result.error:
103
+ is_error = True
104
+ tool_result_content = _maybe_prepend_system_tool_result(result, result.error)
105
+ else:
106
+ if result.output:
107
+ tool_result_content.append(
108
+ {
109
+ "type": "text",
110
+ "text": _maybe_prepend_system_tool_result(result, result.output),
111
+ }
112
+ )
113
+ if result.base64_image:
114
+ tool_result_content.append(
115
+ {
116
+ "type": "image",
117
+ "source": {
118
+ "type": "base64",
119
+ "media_type": "image/png",
120
+ "data": result.base64_image,
121
+ },
122
+ }
123
+ )
124
+ return {
125
+ "type": "tool_result",
126
+ "content": tool_result_content,
127
+ "tool_use_id": tool_use_id,
128
+ "is_error": is_error,
129
+ }
130
+
131
+
132
+ def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
133
+ if result.system:
134
+ result_text = f"<system>{result.system}</system>\n{result_text}"
135
+ return result_text
computer_use_demo/executor/showui_executor.py ADDED
@@ -0,0 +1,376 @@
1
+ import ast
2
+ import asyncio
3
+ from typing import Any, Dict, cast, List, Union
4
+ from collections.abc import Callable
5
+ import uuid
6
+ from anthropic.types.beta import (
7
+ BetaContentBlock,
8
+ BetaContentBlockParam,
9
+ BetaImageBlockParam,
10
+ BetaMessage,
11
+ BetaMessageParam,
12
+ BetaTextBlockParam,
13
+ BetaToolResultBlockParam,
14
+ )
15
+ from anthropic.types import TextBlock
16
+ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock
17
+ from computer_use_demo.tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult
18
+ from computer_use_demo.tools.colorful_text import colorful_text_showui, colorful_text_vlm
19
+
20
+
21
+ class ShowUIExecutor:
22
+ def __init__(
23
+ self,
24
+ output_callback: Callable[[BetaContentBlockParam], None],
25
+ tool_output_callback: Callable[[Any, str], None],
26
+ selected_screen: int = 0
27
+ ):
28
+ self.output_callback = output_callback
29
+ self.tool_output_callback = tool_output_callback
30
+ self.selected_screen = selected_screen
31
+ self.screen_bbox = self._get_screen_resolution()
32
+ print("Screen BBox:", self.screen_bbox)
33
+
34
+ self.tool_collection = ToolCollection(
35
+ ComputerTool(selected_screen=selected_screen, is_scaling=False)
36
+ )
37
+
38
+ self.supported_action_type={
39
+ # "showui_action": "anthropic_tool_action"
40
+ "CLICK": 'key', # TBD
41
+ "INPUT": "key",
42
+ "ENTER": "key", # TBD
43
+ "ESC": "key",
44
+ "ESCAPE": "key",
45
+ "PRESS": "key",
46
+ }
47
+
48
+ def __call__(self, response: str, messages: list[BetaMessageParam]):
49
+ # response is expected to be :
50
+ # {'content': "{'action': 'CLICK', 'value': None, 'position': [0.83, 0.15]}, ...", 'role': 'assistant'},
51
+
52
+ action_dict = self._format_actor_output(response) # str -> dict
53
+
54
+ actions = action_dict["content"]
55
+ role = action_dict["role"]
56
+
57
+ # Parse the actions from showui
58
+ action_list = self._parse_showui_output(actions)
59
+ print("Parsed Action List:", action_list)
60
+
61
+ tool_result_content = None
62
+
63
+ if action_list is not None and len(action_list) > 0:
64
+
65
+ for action in action_list: # Execute the tool (adapting the code from anthropic_executor.py)
66
+
67
+ tool_result_content: list[BetaToolResultBlockParam] = []
68
+
69
+ self.output_callback(f"{colorful_text_showui}:\n{action}", sender="bot")
70
+ print("Converted Action:", action)
71
+
72
+ sim_content_block = BetaToolUseBlock(id=f'toolu_{uuid.uuid4()}',
73
+ input={'action': action["action"], 'text': action["text"], 'coordinate': action["coordinate"]},
74
+ name='computer', type='tool_use')
75
+
76
+ # update messages
77
+ new_message = {
78
+ "role": "assistant",
79
+ "content": cast(list[BetaContentBlockParam], [sim_content_block]),
80
+ }
81
+ if new_message not in messages:
82
+ messages.append(new_message)
83
+
84
+ # Run the asynchronous tool execution in a synchronous context
85
+ result = self.tool_collection.sync_call(
86
+ name=sim_content_block.name,
87
+ tool_input=cast(dict[str, Any], sim_content_block.input),
88
+ )
89
+
90
+ tool_result_content.append(
91
+ _make_api_tool_result(result, sim_content_block.id)
92
+ )
93
+ # print(f"executor: tool_result_content: {tool_result_content}")
94
+ self.tool_output_callback(result, sim_content_block.id)
95
+
96
+ # Craft messages based on the content_block
97
+ # Note: to display the messages in Gradio, organize them as (user message, bot message) pairs
98
+ display_messages = _message_display_callback(messages)
99
+ # Send the messages to the gradio
100
+ for user_msg, bot_msg in display_messages:
101
+ yield [user_msg, bot_msg], tool_result_content
102
+
103
+ return tool_result_content
104
+
105
+
106
+ def _format_actor_output(self, action_output: str|dict) -> Dict[str, Any]:
107
+ if type(action_output) == dict:
108
+ return action_output
109
+ else:
110
+ try:
111
+ # no quote replacement needed: ast.literal_eval below parses single-quoted dicts directly
112
+ action_dict = ast.literal_eval(action_output)
113
+ return action_dict
114
+ except Exception as e:
115
+ print(f"Error parsing action output: {e}")
116
+ return None
117
+
118
+
119
+ def _parse_showui_output(self, output_text: str) -> Union[List[Dict[str, Any]], None]:
120
+ try:
121
+ output_text = output_text.strip()
122
+
123
+ # process single dictionary
124
+ if output_text.startswith("{") and output_text.endswith("}"):
125
+ output_text = f"[{output_text}]"
126
+
127
+ # Validate if the output resembles a list of dictionaries
128
+ if not (output_text.startswith("[") and output_text.endswith("]")):
129
+ raise ValueError("Output does not look like a valid list or dictionary.")
130
+
131
+ print("Output Text:", output_text)
132
+
133
+ parsed_output = ast.literal_eval(output_text)
134
+
135
+ print("Parsed Output:", parsed_output)
136
+
137
+ if isinstance(parsed_output, dict):
138
+ parsed_output = [parsed_output]
139
+ elif not isinstance(parsed_output, list):
140
+ raise ValueError("Parsed output is neither a dictionary nor a list.")
141
+
142
+ if not all(isinstance(item, dict) for item in parsed_output):
143
+ raise ValueError("Not all items in the parsed output are dictionaries.")
144
+
145
+ # refine key: value pairs, mapping to the Anthropic's format
146
+ refined_output = []
147
+
148
+ for action_item in parsed_output:
149
+
150
+ print("Action Item:", action_item)
151
+ # sometimes ShowUI returns lower-case action names
152
+ action_item["action"] = action_item["action"].upper()
153
+
154
+ if action_item["action"] not in self.supported_action_type:
155
+ raise ValueError(f"Action {action_item['action']} not supported. Check the output from ShowUI: {output_text}")
156
+ # continue
157
+
158
+ elif action_item["action"] == "CLICK": # 1. click -> mouse_move + left_click
159
+ x, y = action_item["position"]
160
+ action_item["position"] = (int(x * (self.screen_bbox[2] - self.screen_bbox[0])),
161
+ int(y * (self.screen_bbox[3] - self.screen_bbox[1])))
162
+ refined_output.append({"action": "mouse_move", "text": None, "coordinate": tuple(action_item["position"])})
163
+ refined_output.append({"action": "left_click", "text": None, "coordinate": None})
164
+
165
+ elif action_item["action"] == "INPUT": # 2. input -> type
166
+ refined_output.append({"action": "type", "text": action_item["value"], "coordinate": None})
167
+
168
+ elif action_item["action"] == "ENTER": # 3. enter -> key, enter
169
+ refined_output.append({"action": "key", "text": "Enter", "coordinate": None})
170
+
171
+ elif action_item["action"] == "ESC" or action_item["action"] == "ESCAPE": # 4. esc/escape -> key, Escape
172
+ refined_output.append({"action": "key", "text": "Escape", "coordinate": None})
173
+
174
+ elif action_item["action"] == "HOVER": # 5. hover -> mouse_move
175
+ x, y = action_item["position"]
176
+ action_item["position"] = (int(x * (self.screen_bbox[2] - self.screen_bbox[0])),
177
+ int(y * (self.screen_bbox[3] - self.screen_bbox[1])))
178
+ refined_output.append({"action": "mouse_move", "text": None, "coordinate": tuple(action_item["position"])})
179
+
180
+ elif action_item["action"] == "SCROLL": # 6. scroll -> key: pagedown
181
+ if action_item["value"] == "up":
182
+ refined_output.append({"action": "key", "text": "pageup", "coordinate": None})
183
+ elif action_item["value"] == "down":
184
+ refined_output.append({"action": "key", "text": "pagedown", "coordinate": None})
185
+ else:
186
+ raise ValueError(f"Scroll direction {action_item['value']} not supported.")
187
+
188
+ elif action_item["action"] == "PRESS": # 7. press
189
+ x, y = action_item["position"]
190
+ action_item["position"] = (int(x * (self.screen_bbox[2] - self.screen_bbox[0])),
191
+ int(y * (self.screen_bbox[3] - self.screen_bbox[1])))
192
+ refined_output.append({"action": "mouse_move", "text": None, "coordinate": tuple(action_item["position"])})
193
+ refined_output.append({"action": "left_press", "text": None, "coordinate": None})
194
+
195
+ return refined_output
196
+
197
+ except Exception as e:
198
+ print(f"Error parsing output: {e}")
199
+ return None
200
+
201
+
202
+ def _get_screen_resolution(self):
203
+ from screeninfo import get_monitors
204
+ import platform
205
+ import subprocess # needed by the Linux (xrandr) fallback below
206
+ if platform.system() == "Darwin":
207
+ import Quartz # macOS-only dependency used to query display bounds
208
+
209
+ # Detect platform
210
+ system = platform.system()
211
+
212
+ if system == "Windows":
213
+ # Windows: Use screeninfo to get monitor details
214
+ screens = get_monitors()
215
+
216
+ # Sort screens by x position to arrange from left to right
217
+ sorted_screens = sorted(screens, key=lambda s: s.x)
218
+
219
+ if self.selected_screen < 0 or self.selected_screen >= len(screens):
220
+ raise IndexError("Invalid screen index.")
221
+
222
+ screen = sorted_screens[self.selected_screen]
223
+ bbox = (screen.x, screen.y, screen.x + screen.width, screen.y + screen.height)
224
+
225
+ elif system == "Darwin": # macOS
226
+ # macOS: Use Quartz to get monitor details
227
+ max_displays = 32 # Maximum number of displays to handle
228
+ active_displays = Quartz.CGGetActiveDisplayList(max_displays, None, None)[1]
229
+
230
+ # Get the display bounds (resolution) for each active display
231
+ screens = []
232
+ for display_id in active_displays:
233
+ bounds = Quartz.CGDisplayBounds(display_id)
234
+ screens.append({
235
+ 'id': display_id,
236
+ 'x': int(bounds.origin.x),
237
+ 'y': int(bounds.origin.y),
238
+ 'width': int(bounds.size.width),
239
+ 'height': int(bounds.size.height),
240
+ 'is_primary': Quartz.CGDisplayIsMain(display_id) # Check if this is the primary display
241
+ })
242
+
243
+ # Sort screens by x position to arrange from left to right
244
+ sorted_screens = sorted(screens, key=lambda s: s['x'])
245
+
246
+ if self.selected_screen < 0 or self.selected_screen >= len(screens):
247
+ raise IndexError("Invalid screen index.")
248
+
249
+ screen = sorted_screens[self.selected_screen]
250
+ bbox = (screen['x'], screen['y'], screen['x'] + screen['width'], screen['y'] + screen['height'])
251
+
252
+ else: # Linux or other OS
253
+ cmd = "xrandr | grep ' primary' | awk '{print $4}'"
254
+ try:
255
+ output = subprocess.check_output(cmd, shell=True).decode()
256
+ resolution = output.strip().split()[0]
257
+ width, height = map(int, resolution.split('x'))
258
+ bbox = (0, 0, width, height) # Assuming single primary screen for simplicity
259
+ except subprocess.CalledProcessError:
260
+ raise RuntimeError("Failed to get screen resolution on Linux.")
261
+
262
+ return bbox
263
+
264
+
265
+
266
+ def _message_display_callback(messages):
267
+ display_messages = []
268
+ for msg in messages:
269
+ try:
270
+ if isinstance(msg["content"][0], TextBlock):
271
+ display_messages.append((msg["content"][0].text, None)) # User message
272
+ elif isinstance(msg["content"][0], BetaTextBlock):
273
+ display_messages.append((None, msg["content"][0].text)) # Bot message
274
+ elif isinstance(msg["content"][0], BetaToolUseBlock):
275
+ display_messages.append((None, f"Tool Use: {msg['content'][0].name}\nInput: {msg['content'][0].input}")) # Bot message
276
+ elif isinstance(msg["content"][0], Dict) and msg["content"][0]["content"][-1]["type"] == "image":
277
+ display_messages.append((None, f'<img src="data:image/png;base64,{msg["content"][0]["content"][-1]["source"]["data"]}">')) # Bot message
278
+ else:
279
+ pass
280
+ # print(msg["content"][0])
281
+ except Exception as e:
282
+ print("error", e)
283
+ pass
284
+ return display_messages
285
+
286
+
287
+ def _make_api_tool_result(
288
+ result: ToolResult, tool_use_id: str
289
+ ) -> BetaToolResultBlockParam:
290
+ """Convert an agent ToolResult to an API ToolResultBlockParam."""
291
+ tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = []
292
+ is_error = False
293
+ if result.error:
294
+ is_error = True
295
+ tool_result_content = _maybe_prepend_system_tool_result(result, result.error)
296
+ else:
297
+ if result.output:
298
+ tool_result_content.append(
299
+ {
300
+ "type": "text",
301
+ "text": _maybe_prepend_system_tool_result(result, result.output),
302
+ }
303
+ )
304
+ if result.base64_image:
305
+ tool_result_content.append(
306
+ {
307
+ "type": "image",
308
+ "source": {
309
+ "type": "base64",
310
+ "media_type": "image/png",
311
+ "data": result.base64_image,
312
+ },
313
+ }
314
+ )
315
+ return {
316
+ "type": "tool_result",
317
+ "content": tool_result_content,
318
+ "tool_use_id": tool_use_id,
319
+ "is_error": is_error,
320
+ }
321
+
322
+
323
+ def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
324
+ if result.system:
325
+ result_text = f"<system>{result.system}</system>\n{result_text}"
326
+ return result_text
327
+
328
+
329
+
330
+ # Testing main function
331
+ if __name__ == "__main__":
332
+ def output_callback(content_block):
333
+ # print("Output Callback:", content_block)
334
+ pass
335
+
336
+ def tool_output_callback(result, action):
337
+ print("[showui_executor] Tool Output Callback:", result, action)
338
+ pass
339
+
340
+ # Instantiate the executor
341
+ executor = ShowUIExecutor(
342
+ output_callback=output_callback,
343
+ tool_output_callback=tool_output_callback,
344
+ selected_screen=0
345
+ )
346
+
347
+ # test inputs
348
+ response_content = "{'content': \"{'action': 'CLICK', 'value': None, 'position': [0.49, 0.18]}\", 'role': 'assistant'}"
349
+ # response_content = {'content': "{'action': 'CLICK', 'value': None, 'position': [0.49, 0.39]}", 'role': 'assistant'}
350
+ # response_content = "{'content': \"{'action': 'CLICK', 'value': None, 'position': [0.49, 0.42]}, {'action': 'INPUT', 'value': 'weather for New York city', 'position': [0.49, 0.42]}, {'action': 'ENTER', 'value': None, 'position': None}\", 'role': 'assistant'}"
351
+
352
+ # Initialize messages
353
+ messages = []
354
+
355
+ # Call the executor
356
+ print("Testing ShowUIExecutor with response content:", response_content)
357
+ for message, tool_result_content in executor(response_content, messages):
358
+ print("Message:", message)
359
+ print("Tool Result Content:", tool_result_content)
360
+
361
+ # Display final messages
362
+ print("\nFinal messages:")
363
+ for msg in messages:
364
+ print(msg)
365
+
366
+
367
+
368
+ [
369
+ {'role': 'user', 'content': ['open a new tab and go to amazon.com', 'tmp/outputs/screenshot_b4a1b7e60a5c47359bedbd8707573966.png']},
370
+ {'role': 'assistant', 'content': ["History Action: {'action': 'mouse_move', 'text': None, 'coordinate': (1216, 88)}"]},
371
+ {'role': 'assistant', 'content': ["History Action: {'action': 'left_click', 'text': None, 'coordinate': None}"]},
372
+ {'content': [
373
+ {'type': 'tool_result', 'content': [{'type': 'text', 'text': 'Moved mouse to (1216, 88)'}], 'tool_use_id': 'toolu_ae4f2886-366c-4789-9fa6-ec13461cef12', 'is_error': False},
374
+ {'type': 'tool_result', 'content': [{'type': 'text', 'text': 'Performed left_click'}], 'tool_use_id': 'toolu_a7377954-e1b7-4746-9757-b2eb4dcddc82', 'is_error': False}
375
+ ], 'role': 'user'}
376
+ ]
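For reference, a worked example of the coordinate handling in _parse_showui_output above, assuming a single 1920x1080 screen (so screen_bbox would be (0, 0, 1920, 1080)):
bbox = (0, 0, 1920, 1080)          # assumed screen bounding box
rel_x, rel_y = 0.49, 0.18          # relative position from the test input above
abs_xy = (int(rel_x * (bbox[2] - bbox[0])), int(rel_y * (bbox[3] - bbox[1])))
print(abs_xy)                      # (940, 194)
# A CLICK is then expanded into two tool calls:
#   {'action': 'mouse_move', 'text': None, 'coordinate': (940, 194)}
#   {'action': 'left_click', 'text': None, 'coordinate': None}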
computer_use_demo/gui_agent/actor/showui_agent.py ADDED
@@ -0,0 +1,178 @@
1
+ import os
2
+ import ast
3
+ import base64
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+ from uuid import uuid4
7
+
8
+ import pyautogui
9
+ import requests
10
+ import torch
11
+ from PIL import Image, ImageDraw
12
+ from qwen_vl_utils import process_vision_info
13
+ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
14
+
15
+ from computer_use_demo.gui_agent.llm_utils.oai import encode_image
16
+ from computer_use_demo.tools.colorful_text import colorful_text_showui, colorful_text_vlm
17
+ from computer_use_demo.tools.screen_capture import get_screenshot
18
+
19
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
20
+
21
+
22
+ class ShowUIActor:
23
+ _NAV_SYSTEM = """
24
+ You are an assistant trained to navigate the {_APP} screen.
25
+ Given a task instruction, a screen observation, and an action history sequence,
26
+ output the next action and wait for the next observation.
27
+ Here is the action space:
28
+ {_ACTION_SPACE}
29
+ """
30
+
31
+ _NAV_FORMAT = """
32
+ Format the action as a dictionary with the following keys:
33
+ {'action': 'ACTION_TYPE', 'value': 'element', 'position': [x,y]}
34
+
35
+ If value or position is not applicable, set it as None.
36
+ Position might be [[x1,y1], [x2,y2]] if the action requires a start and end position.
37
+ Position represents the relative coordinates on the screenshot and should be scaled to a range of 0-1.
38
+ """
39
+
40
+ action_map = {
41
+ 'desktop': """
42
+ 1. CLICK: Click on an element, value is not applicable and the position [x,y] is required.
43
+ 2. INPUT: Type a string into an element, value is a string to type and the position [x,y] is required.
44
+ 3. HOVER: Hover on an element, value is not applicable and the position [x,y] is required.
45
+ 4. ENTER: Enter operation, value and position are not applicable.
46
+ 5. SCROLL: Scroll the screen, value is the direction to scroll and the position is not applicable.
47
+ 6. ESC: ESCAPE operation, value and position are not applicable.
48
+ 7. PRESS: Long click on an element, value is not applicable and the position [x,y] is required.
49
+ """,
50
+ 'phone': """
51
+ 1. INPUT: Type a string into an element, value is not applicable and the position [x,y] is required.
52
+ 2. SWIPE: Swipe the screen, value is not applicable and the position [[x1,y1], [x2,y2]] is the start and end position of the swipe operation.
53
+ 3. TAP: Tap on an element, value is not applicable and the position [x,y] is required.
54
+ 4. ANSWER: Answer the question, value is the status (e.g., 'task complete') and the position is not applicable.
55
+ 5. ENTER: Enter operation, value and position are not applicable.
56
+ """
57
+ }
58
+
59
+ def __init__(self, model_path, output_callback, device=torch.device("cpu"), split='desktop', selected_screen=0,
60
+ max_pixels=1344, awq_4bit=False):
61
+ self.device = device
62
+ self.split = split
63
+ self.selected_screen = selected_screen
64
+ self.output_callback = output_callback
65
+
66
+ if not model_path or not os.path.exists(model_path) or not os.listdir(model_path):
67
+ if awq_4bit:
68
+ model_path = "showlab/ShowUI-2B-AWQ-4bit"
69
+ else:
70
+ model_path = "showlab/ShowUI-2B"
71
+
72
+ self.model = Qwen2VLForConditionalGeneration.from_pretrained(
73
+ model_path,
74
+ torch_dtype=torch.bfloat16,
75
+ device_map="cpu"
76
+ ).to(self.device)
77
+ self.model.eval()
78
+
79
+ self.min_pixels = 256 * 28 * 28
80
+ self.max_pixels = max_pixels * 28 * 28
81
+ # self.max_pixels = 1344 * 28 * 28
82
+
83
+ self.processor = AutoProcessor.from_pretrained(
84
+ "Qwen/Qwen2-VL-2B-Instruct",
85
+ # "./Qwen2-VL-2B-Instruct",
86
+ min_pixels=self.min_pixels,
87
+ max_pixels=self.max_pixels
88
+ )
89
+ self.system_prompt = self._NAV_SYSTEM.format(
90
+ _APP=split,
91
+ _ACTION_SPACE=self.action_map[split]
92
+ )
93
+ self.action_history = '' # Initialize action history
94
+
95
+ def __call__(self, messages):
96
+
97
+ task = messages
98
+
99
+ # screenshot
100
+ screenshot, screenshot_path = get_screenshot(selected_screen=self.selected_screen, resize=True, target_width=1920, target_height=1080)
101
+ screenshot_path = str(screenshot_path)
102
+ image_base64 = encode_image(screenshot_path)
103
+ self.output_callback(f'Screenshot for {colorful_text_showui}:\n<img src="data:image/png;base64,{image_base64}">', sender="bot")
104
+
105
+ # Use system prompt, task, and action history to build the messages
106
+ messages_for_processor = [
107
+ {
108
+ "role": "user",
109
+ "content": [
110
+ {"type": "text", "text": self.system_prompt},
111
+ {"type": "image", "image": screenshot_path, "min_pixels": self.min_pixels, "max_pixels": self.max_pixels},
112
+ {"type": "text", "text": f"Task: {task}"}
113
+ ],
114
+ }
115
+ ]
116
+
117
+ text = self.processor.apply_chat_template(
118
+ messages_for_processor, tokenize=False, add_generation_prompt=True,
119
+ )
120
+ image_inputs, video_inputs = process_vision_info(messages_for_processor)
121
+ inputs = self.processor(
122
+ text=[text],
123
+ images=image_inputs,
124
+ videos=video_inputs,
125
+ padding=True,
126
+ return_tensors="pt",
127
+ )
128
+ inputs = inputs.to(self.device)
129
+
130
+ with torch.no_grad():
131
+ generated_ids = self.model.generate(**inputs, max_new_tokens=128)
132
+
133
+ generated_ids_trimmed = [
134
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
135
+ ]
136
+ output_text = self.processor.batch_decode(
137
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
138
+ )[0]
139
+
140
+ # Update action history
141
+ self.action_history += output_text + '\n'
142
+
143
+ # Return response in expected format
144
+ response = {'content': output_text, 'role': 'assistant'}
145
+ return response
146
+
147
+
148
+ def parse_showui_output(self, output_text):
149
+ try:
150
+ # Ensure the output is stripped of any extra spaces
151
+ output_text = output_text.strip()
152
+
153
+ # Wrap the input in brackets if it looks like a single dictionary
154
+ if output_text.startswith("{") and output_text.endswith("}"):
155
+ output_text = f"[{output_text}]"
156
+
157
+ # Validate if the output resembles a list of dictionaries
158
+ if not (output_text.startswith("[") and output_text.endswith("]")):
159
+ raise ValueError("Output does not look like a valid list or dictionary.")
160
+
161
+ # Parse the output using ast.literal_eval
162
+ parsed_output = ast.literal_eval(output_text)
163
+
164
+ # Ensure the result is a list
165
+ if isinstance(parsed_output, dict):
166
+ parsed_output = [parsed_output]
167
+ elif not isinstance(parsed_output, list):
168
+ raise ValueError("Parsed output is neither a dictionary nor a list.")
169
+
170
+ # Ensure all elements in the list are dictionaries
171
+ if not all(isinstance(item, dict) for item in parsed_output):
172
+ raise ValueError("Not all items in the parsed output are dictionaries.")
173
+
174
+ return parsed_output
175
+
176
+ except Exception as e:
177
+ print(f"Error parsing output: {e}")
178
+ return None
computer_use_demo/gui_agent/actor/uitars_agent.py ADDED
@@ -0,0 +1,169 @@
1
+ import json
2
+ import re
3
+ from openai import OpenAI
4
+
5
+ from computer_use_demo.gui_agent.llm_utils.oai import encode_image
6
+ from computer_use_demo.tools.screen_capture import get_screenshot
7
+ from computer_use_demo.tools.logger import logger, truncate_string
8
+
9
+
10
+ class UITARS_Actor:
11
+ """
12
+ In OOTB, we use the default grounding system prompt from the UI_TARS repo, and then convert its actions to our action format.
13
+ """
14
+
15
+ _NAV_SYSTEM_GROUNDING = """
16
+ You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
17
+
18
+ ## Output Format
19
+ ```Action: ...```
20
+
21
+ ## Action Space
22
+ click(start_box='<|box_start|>(x1,y1)<|box_end|>')
23
+ hotkey(key='')
24
+ type(content='') #If you want to submit your input, use \"\" at the end of `content`.
25
+ scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
26
+ wait() #Sleep for 5s and take a screenshot to check for any changes.
27
+ finished()
28
+ call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
29
+
30
+ ## Note
31
+ - Do not generate any other text.
32
+ """
33
+
34
+ def __init__(self, ui_tars_url, output_callback, api_key="", selected_screen=0):
35
+
36
+ self.ui_tars_url = ui_tars_url
37
+ self.ui_tars_client = OpenAI(base_url=self.ui_tars_url, api_key=api_key)
38
+ self.selected_screen = selected_screen
39
+ self.output_callback = output_callback
40
+
41
+ self.grounding_system_prompt = self._NAV_SYSTEM_GROUNDING.format()
42
+
43
+
44
+ def __call__(self, messages):
45
+
46
+ task = messages
47
+
48
+ # take screenshot
49
+ screenshot, screenshot_path = get_screenshot(selected_screen=self.selected_screen, resize=True, target_width=1920, target_height=1080)
50
+ screenshot_path = str(screenshot_path)
51
+ screenshot_base64 = encode_image(screenshot_path)
52
+
53
+ logger.info(f"Sending messages to UI-TARS on {self.ui_tars_url}: {task}, screenshot: {screenshot_path}")
54
+
55
+ response = self.ui_tars_client.chat.completions.create(
56
+ model="ui-tars",
57
+ messages=[
58
+ {"role": "system", "content": self.grounding_system_prompt},
59
+ {"role": "user", "content": [
60
+ {"type": "text", "text": task},
61
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_base64}"}}
62
+ ]
63
+ },
64
+ ],
65
+ max_tokens=256,
66
+ temperature=0
67
+ )
68
+
69
+ ui_tars_action = response.choices[0].message.content
70
+ converted_action = convert_ui_tars_action_to_json(ui_tars_action)
71
+ response = str(converted_action)
72
+
73
+ response = {'content': response, 'role': 'assistant'}
74
+ return response
75
+
76
+
77
+
78
+ def convert_ui_tars_action_to_json(action_str: str) -> str:
79
+ """
80
+ Converts an action line such as:
81
+ Action: click(start_box='(153,97)')
82
+ into a JSON string of the form:
83
+ {
84
+ "action": "CLICK",
85
+ "value": null,
86
+ "position": [153, 97]
87
+ }
88
+ """
89
+
90
+ # Strip leading/trailing whitespace and remove "Action: " prefix if present
91
+ action_str = action_str.strip()
92
+ if action_str.startswith("Action:"):
93
+ action_str = action_str[len("Action:"):].strip()
94
+
95
+ # Mappings from old action names to the new action schema
96
+ ACTION_MAP = {
97
+ "click": "CLICK",
98
+ "type": "INPUT",
99
+ "scroll": "SCROLL",
100
+ "wait": "STOP", # TODO: deal with "wait()"
101
+ "finished": "STOP",
102
+ "call_user": "STOP",
103
+ "hotkey": "HOTKEY", # We break down the actual key below (Enter, Esc, etc.)
104
+ }
105
+
106
+ # Prepare a structure for the final JSON
107
+ # Default to no position and null value
108
+ output_dict = {
109
+ "action": None,
110
+ "value": None,
111
+ "position": None
112
+ }
113
+
114
+ # 1) CLICK(...) e.g. click(start_box='(153,97)')
115
+ match_click = re.match(r"^click\(start_box='\(?(\d+),\s*(\d+)\)?'\)$", action_str)
116
+ if match_click:
117
+ x, y = match_click.groups()
118
+ output_dict["action"] = ACTION_MAP["click"]
119
+ output_dict["position"] = [int(x), int(y)]
120
+ return json.dumps(output_dict)
121
+
122
+ # 2) HOTKEY(...) e.g. hotkey(key='Enter')
123
+ match_hotkey = re.match(r"^hotkey\(key='([^']+)'\)$", action_str)
124
+ if match_hotkey:
125
+ key = match_hotkey.group(1).lower()
126
+ if key == "enter":
127
+ output_dict["action"] = "ENTER"
128
+ elif key == "esc":
129
+ output_dict["action"] = "ESC"
130
+ else:
131
+ # Otherwise treat it as some generic hotkey
132
+ output_dict["action"] = ACTION_MAP["hotkey"]
133
+ output_dict["value"] = key
134
+ return json.dumps(output_dict)
135
+
136
+ # 3) TYPE(...) e.g. type(content='some text')
137
+ match_type = re.match(r"^type\(content='([^']*)'\)$", action_str)
138
+ if match_type:
139
+ typed_content = match_type.group(1)
140
+ output_dict["action"] = ACTION_MAP["type"]
141
+ output_dict["value"] = typed_content
142
+ # If you want a position (x,y) you need it in your string. Otherwise it's omitted.
143
+ return json.dumps(output_dict)
144
+
145
+ # 4) SCROLL(...) e.g. scroll(start_box='(153,97)', direction='down')
146
+ # or scroll(start_box='...', direction='down')
147
+ match_scroll = re.match(
148
+ r"^scroll\(start_box='[^']*'\s*,\s*direction='(down|up|left|right)'\)$",
149
+ action_str
150
+ )
151
+ if match_scroll:
152
+ direction = match_scroll.group(1)
153
+ output_dict["action"] = ACTION_MAP["scroll"]
154
+ output_dict["value"] = direction
155
+ return json.dumps(output_dict)
156
+
157
+ # 5) WAIT() or FINISHED() or CALL_USER() etc.
158
+ if action_str in ["wait()", "finished()", "call_user()"]:
159
+ base_action = action_str.replace("()", "")
160
+ if base_action in ACTION_MAP:
161
+ output_dict["action"] = ACTION_MAP[base_action]
162
+ else:
163
+ output_dict["action"] = "STOP"
164
+ return json.dumps(output_dict)
165
+
166
+ # If none of the above patterns matched, you can decide how to handle
167
+ # unknown or unexpected action lines:
168
+ output_dict["action"] = "STOP"
169
+ return json.dumps(output_dict)
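A small usage sketch for convert_ui_tars_action_to_json, based on the patterns handled above (calls are illustrative):
print(convert_ui_tars_action_to_json("Action: click(start_box='(153,97)')"))
# {"action": "CLICK", "value": null, "position": [153, 97]}
print(convert_ui_tars_action_to_json("Action: hotkey(key='Enter')"))
# {"action": "ENTER", "value": null, "position": null}
print(convert_ui_tars_action_to_json("Action: type(content='hello world')"))
# {"action": "INPUT", "value": "hello world", "position": null}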
computer_use_demo/gui_agent/llm_utils/llm_utils.py ADDED
@@ -0,0 +1,109 @@
1
+ import os
2
+ import re
3
+ import ast
4
+ import base64
5
+
6
+
7
+ def is_image_path(text):
8
+ # Checking if the input text ends with typical image file extensions
9
+ image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
10
+ if text.endswith(image_extensions):
11
+ return True
12
+ else:
13
+ return False
14
+
15
+
16
+ def encode_image(image_path):
17
+ """Encode image file to base64."""
18
+ with open(image_path, "rb") as image_file:
19
+ return base64.b64encode(image_file.read()).decode("utf-8")
20
+
21
+
22
+ def is_url_or_filepath(input_string):
23
+ # Check if input_string is a URL
24
+ url_pattern = re.compile(
25
+ r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
26
+ )
27
+ if url_pattern.match(input_string):
28
+ return "URL"
29
+
30
+ # Check if input_string is a file path
31
+ file_path = os.path.abspath(input_string)
32
+ if os.path.exists(file_path):
33
+ return "File path"
34
+
35
+ return "Invalid"
36
+
37
+
38
+ def extract_data(input_string, data_type):
39
+ # Regular expression to extract content starting from '```python' until the end if there are no closing backticks
40
+ pattern = f"```{data_type}" + r"(.*?)(```|$)"
41
+ # Extract content
42
+ # re.DOTALL allows '.' to match newlines as well
43
+ matches = re.findall(pattern, input_string, re.DOTALL)
44
+ # Return the first match if exists, trimming whitespace and ignoring potential closing backticks
45
+ return matches[0][0].strip() if matches else input_string
46
+
47
+
48
+ def parse_input(code):
49
+ """Use AST to parse the input string and extract the function name, arguments, and keyword arguments."""
50
+
51
+ def get_target_names(target):
52
+ """Recursively get all variable names from the assignment target."""
53
+ if isinstance(target, ast.Name):
54
+ return [target.id]
55
+ elif isinstance(target, ast.Tuple):
56
+ names = []
57
+ for elt in target.elts:
58
+ names.extend(get_target_names(elt))
59
+ return names
60
+ return []
61
+
62
+ def extract_value(node):
63
+ """Extract the actual value from an AST node."""
64
+ if isinstance(node, ast.Constant):
65
+ return node.value
66
+ elif isinstance(node, ast.Name):
67
+ # TODO: a better way to handle variables
68
+ raise ValueError(
69
+ f"Arguments should be a Constant, got a variable {node.id} instead."
70
+ )
71
+ # handle additional AST node types here as needed
72
+ return None
73
+
74
+ try:
75
+ tree = ast.parse(code)
76
+ for node in ast.walk(tree):
77
+ if isinstance(node, ast.Assign):
78
+ targets = []
79
+ for t in node.targets:
80
+ targets.extend(get_target_names(t))
81
+ if isinstance(node.value, ast.Call):
82
+ func_name = node.value.func.id
83
+ args = [ast.dump(arg) for arg in node.value.args]
84
+ kwargs = {
85
+ kw.arg: extract_value(kw.value) for kw in node.value.keywords
86
+ }
87
+ print(f"Input: {code.strip()}")
88
+ print(f"Output Variables: {targets}")
89
+ print(f"Function Name: {func_name}")
90
+ print(f"Arguments: {args}")
91
+ print(f"Keyword Arguments: {kwargs}")
92
+ elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
93
+ targets = []
94
+ func_name = extract_value(node.value.func)
95
+ args = [extract_value(arg) for arg in node.value.args]
96
+ kwargs = {kw.arg: extract_value(kw.value) for kw in node.value.keywords}
97
+
98
+ except SyntaxError:
99
+ print(f"Input: {code.strip()}")
100
+ print("No match found")
101
+
102
+ return targets, func_name, args, kwargs
103
+
104
+
105
+ if __name__ == "__main__":
106
+ import json
107
+ s='{"Thinking": "The Docker icon has been successfully clicked, and the Docker application should now be opening. No further actions are required.", "Next Action": None}'
108
+ parsed = ast.literal_eval(s) # json.loads(s) would raise here: None is not valid JSON (JSON uses null)
109
+ print(parsed)
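A quick usage sketch for extract_data above (the snippet is illustrative):
snippet = "Here is the code:\n```python\nprint('hi')\n```"
print(extract_data(snippet, "python"))   # -> print('hi')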
computer_use_demo/gui_agent/llm_utils/oai.py ADDED
@@ -0,0 +1,218 @@
1
+ import os
2
+ import logging
3
+ import base64
4
+ import requests
5
+ from computer_use_demo.gui_agent.llm_utils.llm_utils import is_image_path, encode_image
6
+
7
+
8
+
9
+ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
10
+
11
+ api_key = api_key or os.environ.get("OPENAI_API_KEY")
12
+ if not api_key:
13
+ raise ValueError("OPENAI_API_KEY is not set")
14
+
15
+ headers = {"Content-Type": "application/json",
16
+ "Authorization": f"Bearer {api_key}"}
17
+
18
+ final_messages = [{"role": "system", "content": system}]
19
+
20
+ # image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
21
+ if type(messages) == list:
22
+ for item in messages:
23
+ contents = []
24
+ if isinstance(item, dict):
25
+ for cnt in item["content"]:
26
+ if isinstance(cnt, str):
27
+ if is_image_path(cnt):
28
+ base64_image = encode_image(cnt)
29
+ content = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
30
+ # content = {"type": "image_url", "image_url": {"url": image_url}}
31
+ else:
32
+ content = {"type": "text", "text": cnt}
33
+ contents.append(content)
34
+
35
+ message = {"role": item["role"], "content": contents}
36
+ else: # str
37
+ contents.append({"type": "text", "text": item})
38
+ message = {"role": "user", "content": contents}
39
+
40
+ final_messages.append(message)
41
+
42
+
43
+ elif isinstance(messages, str):
44
+ final_messages = [{"role": "user", "content": messages}]
45
+
46
+ print("[oai] sending messages:", final_messages)
47
+
48
+ payload = {
49
+ "model": llm,
50
+ "messages": final_messages,
51
+ "max_tokens": max_tokens,
52
+ "temperature": temperature,
53
+ # "stop": stop,
54
+ }
55
+
56
+ # from IPython.core.debugger import Pdb; Pdb().set_trace()
57
+
58
+ response = requests.post(
59
+ "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
60
+ )
61
+
62
+ try:
63
+ text = response.json()['choices'][0]['message']['content']
64
+ token_usage = int(response.json()['usage']['total_tokens'])
65
+ return text, token_usage
66
+
67
+ # return error message if the response is not successful
68
+ except Exception as e:
69
+ print(f"Error in interleaved OpenAI call: {e}. This may be due to an invalid OPENAI_API_KEY. Please check the response: {response.json()}")
70
+ return response.json()
71
+
72
+ def run_ssh_llm_interleaved(messages: list, system: str, llm: str, ssh_host: str, ssh_port: int, max_tokens=256, temperature=0.7, do_sample=True):
73
+ """Send chat completion request to SSH remote server"""
74
+ from PIL import Image
75
+ from io import BytesIO
76
+ def encode_image(image_path: str, max_size=1024) -> str:
77
+ """Convert image to base64 encoding with preprocessing"""
78
+ try:
79
+ with Image.open(image_path) as img:
80
+ # Convert to RGB format
81
+ img = img.convert('RGB')
82
+
83
+ # Scale down if image is too large
84
+ if max(img.size) > max_size:
85
+ ratio = max_size / max(img.size)
86
+ new_size = tuple(int(dim * ratio) for dim in img.size)
87
+ img = img.resize(new_size, Image.LANCZOS)
88
+
89
+ # Convert processed image to base64
90
+ buffered = BytesIO()
91
+ img.save(buffered, format="JPEG", quality=85)
92
+ img_str = base64.b64encode(buffered.getvalue()).decode()
93
+ return img_str
94
+ except Exception as e:
95
+ print(f"Image processing failed: {str(e)}")
96
+ raise
97
+
98
+
99
+ try:
100
+ # Verify SSH connection info
101
+ if not ssh_host or not ssh_port:
102
+ raise ValueError("SSH_HOST and SSH_PORT are not set")
103
+
104
+ # Build API URL
105
+ api_url = f"http://{ssh_host}:{ssh_port}"
106
+
107
+ # Prepare message list
108
+ final_messages = []
109
+
110
+ # Add system message
111
+ if system:
112
+ final_messages.append({
113
+ "role": "system",
114
+ "content": system
115
+ })
116
+
117
+ # Process user messages
118
+ if type(messages) == list:
119
+ for item in messages:
120
+ contents = []
121
+ if isinstance(item, dict):
122
+ for cnt in item["content"]:
123
+ if isinstance(cnt, str):
124
+ if is_image_path(cnt):
125
+ base64_image = encode_image(cnt)
126
+ content = {
127
+ "type": "image_url",
128
+ "image_url": {
129
+ "url": f"data:image/jpeg;base64,{base64_image}"
130
+ }
131
+ }
132
+ else:
133
+ content = {
134
+ "type": "text",
135
+ "text": cnt
136
+ }
137
+ contents.append(content)
138
+ message = {"role": item["role"], "content": contents}
139
+ else: # str
140
+ contents.append({"type": "text", "text": item})
141
+ message = {"role": "user", "content": contents}
142
+ final_messages.append(message)
143
+ elif isinstance(messages, str):
144
+ final_messages.append({
145
+ "role": "user",
146
+ "content": messages
147
+ })
148
+
149
+ # Prepare request data
150
+ data = {
151
+ "model": llm,
152
+ "messages": final_messages,
153
+ "temperature": temperature,
154
+ "max_tokens": max_tokens,
155
+ "do_sample": do_sample
156
+ }
157
+
158
+ print(f"[ssh] Sending chat completion request to model: {llm}")
159
+ print(f"[ssh] sending messages:", final_messages)
160
+
161
+ # Send request
162
+ response = requests.post(
163
+ f"{api_url}/v1/chat/completions",
164
+ json=data,
165
+ headers={"Content-Type": "application/json"},
166
+ timeout=30
167
+ )
168
+
169
+ result = response.json()
170
+
171
+ if response.status_code == 200:
172
+ content = result['choices'][0]['message']['content']
173
+ token_usage = int(result['usage']['total_tokens'])
174
+ print(f"[ssh] Generation successful: {content}")
175
+ return content, token_usage
176
+ else:
177
+ print(f"[ssh] Request failed: {result}")
178
+ raise Exception(f"API request failed: {result}")
179
+
180
+ except Exception as e:
181
+ print(f"[ssh] Chat completion request failed: {str(e)}")
182
+ raise
183
+
184
+
185
+
186
+ if __name__ == "__main__":
187
+
188
+ api_key = os.environ.get("OPENAI_API_KEY")
189
+ if not api_key:
190
+ raise ValueError("OPENAI_API_KEY is not set")
191
+
192
+ # text, token_usage = run_oai_interleaved(
193
+ # messages= [{"content": [
194
+ # "What is in the screenshot?",
195
+ # "./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"],
196
+ # "role": "user"
197
+ # }],
198
+ # llm="gpt-4o-mini",
199
+ # system="You are a helpful assistant",
200
+ # api_key=api_key,
201
+ # max_tokens=256,
202
+ # temperature=0)
203
+
204
+ # print(text, token_usage)
205
+ text, token_usage = run_ssh_llm_interleaved(
206
+ messages= [{"content": [
207
+ "What is in the screenshot?",
208
+ "tmp/outputs/screenshot_5a26d36c59e84272ab58c1b34493d40d.png"],
209
+ "role": "user"
210
+ }],
211
+ llm="Qwen2.5-VL-7B-Instruct",
212
+ ssh_host="10.245.92.68",
213
+ ssh_port=9192,
214
+ max_tokens=256,
215
+ temperature=0.7
216
+ )
217
+ print(text, token_usage)
218
+ # There is an introduction describing the Calyx... 36986
computer_use_demo/gui_agent/llm_utils/qwen.py ADDED
@@ -0,0 +1,108 @@
1
+
2
+ import os
3
+ import logging
4
+ import base64
5
+ import requests
6
+
7
+ import dashscope
8
+ # from computer_use_demo.gui_agent.llm_utils import is_image_path, encode_image
9
+
10
+ def is_image_path(text):
11
+ return False
12
+
13
+ def encode_image(image_path):
14
+ return ""
15
+
16
+
17
+ def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
18
+
19
+ api_key = api_key or os.environ.get("QWEN_API_KEY")
20
+ if not api_key:
21
+ raise ValueError("QWEN_API_KEY is not set")
22
+
23
+ dashscope.api_key = api_key
24
+
25
+ # from IPython.core.debugger import Pdb; Pdb().set_trace()
26
+
27
+ final_messages = [{"role": "system", "content": [{"text": system}]}]
28
+ # image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
29
+ if type(messages) == list:
30
+ for item in messages:
31
+ contents = []
32
+ if isinstance(item, dict):
33
+ for cnt in item["content"]:
34
+ if isinstance(cnt, str):
35
+ if is_image_path(cnt):
36
+ # base64_image = encode_image(cnt)
37
+ content = [{"image": cnt}]
38
+ # content = {"type": "image_url", "image_url": {"url": image_url}}
39
+ else:
40
+ content = {"text": cnt}
41
+ contents.append(content)
42
+
43
+ message = {"role": item["role"], "content": contents}
44
+ else: # str
45
+ contents.append({"text": item})
46
+ message = {"role": "user", "content": contents}
47
+
48
+ final_messages.append(message)
49
+
50
+ print("[qwen-vl] sending messages:", final_messages)
51
+
52
+ response = dashscope.MultiModalConversation.call(
53
+ model='qwen-vl-max-latest',
54
+ # model='qwen-vl-max-0809',
55
+ messages=final_messages
56
+ )
57
+
58
+ # from IPython.core.debugger import Pdb; Pdb().set_trace()
59
+
60
+ try:
61
+ text = response.output.choices[0].message.content[0]['text']
62
+ usage = response.usage
63
+
64
+ if "total_tokens" not in usage:
65
+ token_usage = int(usage["input_tokens"] + usage["output_tokens"])
66
+ else:
67
+ token_usage = int(usage["total_tokens"])
68
+
69
+ return text, token_usage
70
+ # return response.json()['choices'][0]['message']['content']
71
+ # return error message if the response is not successful
72
+ except Exception as e:
73
+ print(f"Error in Qwen-VL call: {e}. This may be due to an invalid QWEN_API_KEY. Please check the response: {response}")
74
+ return response.json()
75
+
76
+
77
+
78
+ if __name__ == "__main__":
79
+ api_key = os.environ.get("QWEN_API_KEY")
80
+ if not api_key:
81
+ raise ValueError("QWEN_API_KEY is not set")
82
+
83
+ dashscope.api_key = api_key
84
+
85
+ final_messages = [{"role": "user",
86
+ "content": [
87
+ {"text": "What is in the screenshot?"},
88
+ {"image": "./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"}
89
+ ]
90
+ }
91
+ ]
92
+ response = dashscope.MultiModalConversation.call(model='qwen-vl-max-0809', messages=final_messages)
93
+
94
+ print(response)
95
+
96
+ text = response.output.choices[0].message.content[0]['text']
97
+ usage = response.usage
98
+
99
+ if "total_tokens" not in usage:
100
+ if "image_tokens" in usage:
101
+ token_usage = usage["input_tokens"] + usage["output_tokens"] + usage["image_tokens"]
102
+ else:
103
+ token_usage = usage["input_tokens"] + usage["output_tokens"]
104
+ else:
105
+ token_usage = usage["total_tokens"]
106
+
107
+ print(text, token_usage)
108
+ # The screenshot is from a video game... 1387
computer_use_demo/gui_agent/llm_utils/run_llm.py ADDED
@@ -0,0 +1,44 @@
1
+ import base64
2
+ import logging
3
+ from .oai import run_oai_interleaved
4
+ from .gemini import run_gemini_interleaved
5
+
6
+ def run_llm(prompt, llm="gpt-4o-mini", max_tokens=256, temperature=0, stop=None):
7
+ log_prompt(prompt)
8
+
9
+ # turn string prompt into list
10
+ if isinstance(prompt, str):
11
+ prompt = [prompt]
12
+ elif isinstance(prompt, list):
13
+ pass
14
+ else:
15
+ raise ValueError(f"Invalid prompt type: {type(prompt)}")
16
+
17
+ if llm.startswith("gpt"): # gpt series
18
+ out = run_oai_interleaved(
19
+ prompt,
20
+ llm,
21
+ max_tokens,
22
+ temperature,
23
+ stop
24
+ )
25
+ elif llm.startswith("gemini"): # gemini series
26
+ out = run_gemini_interleaved(
27
+ prompt,
28
+ llm,
29
+ max_tokens,
30
+ temperature,
31
+ stop
32
+ )
33
+ else:
34
+ raise ValueError(f"Invalid llm: {llm}")
35
+ logging.info(
36
+ f"========Output for {llm}=======\n{out}\n============================")
37
+ return out
38
+
39
+ def log_prompt(prompt):
40
+ prompt_display = [prompt] if isinstance(prompt, str) else prompt
41
+ prompt_display = "\n\n".join(prompt_display)
42
+ logging.info(
43
+ f"========Prompt=======\n{prompt_display}\n============================")
44
+
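
A rough usage sketch for run_llm (the prompt text and model name below are illustrative, and it assumes the API key for the underlying helper is already configured):

from computer_use_demo.gui_agent.llm_utils.run_llm import run_llm

# model names starting with "gpt" are routed to run_oai_interleaved, "gemini" to run_gemini_interleaved
out = run_llm(
    "Describe in one sentence what a GUI agent does.",  # illustrative prompt
    llm="gpt-4o-mini",
    max_tokens=128,
    temperature=0,
)
print(out)  # whatever the provider helper returns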
computer_use_demo/gui_agent/planner/anthropic_agent.py ADDED
@@ -0,0 +1,206 @@
1
+ """
2
+ Anthropic actor that calls the Anthropic API with the local implementation of Anthropic-defined computer use tools.
3
+ """
4
+ import asyncio
5
+ import platform
6
+ from collections.abc import Callable
7
+ from datetime import datetime
8
+ from enum import StrEnum
9
+ from typing import Any, cast
10
+
11
+ from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse
12
+ from anthropic.types import (
13
+ ToolResultBlockParam,
14
+ )
15
+ from anthropic.types.beta import (
16
+ BetaContentBlock,
17
+ BetaContentBlockParam,
18
+ BetaImageBlockParam,
19
+ BetaMessage,
20
+ BetaMessageParam,
21
+ BetaTextBlockParam,
22
+ BetaToolResultBlockParam,
23
+ )
24
+ from anthropic.types import TextBlock
25
+ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock
26
+
27
+ from computer_use_demo.tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult
28
+
29
+ from PIL import Image
30
+ from io import BytesIO
31
+ import gradio as gr
32
+ from typing import Dict
33
+
34
+
35
+ BETA_FLAG = "computer-use-2024-10-22"
36
+
37
+
38
+ class APIProvider(StrEnum):
39
+ ANTHROPIC = "anthropic"
40
+ BEDROCK = "bedrock"
41
+ VERTEX = "vertex"
42
+
43
+
44
+ PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
45
+ APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
46
+ APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
47
+ APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
48
+ }
49
+
50
+
51
+ # Check OS
52
+ SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
53
+ * You are utilizing a Windows system with internet access.
54
+ * The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
55
+ </SYSTEM_CAPABILITY>
56
+ """
57
+
58
+
59
+ class AnthropicActor:
60
+ def __init__(
61
+ self,
62
+ model: str,
63
+ provider: APIProvider,
64
+ system_prompt_suffix: str,
65
+ api_key: str,
66
+ api_response_callback: Callable[[APIResponse[BetaMessage]], None],
67
+ max_tokens: int = 4096,
68
+ only_n_most_recent_images: int | None = None,
69
+ selected_screen: int = 0,
70
+ print_usage: bool = True,
71
+ ):
72
+ self.model = model
73
+ self.provider = provider
74
+ self.system_prompt_suffix = system_prompt_suffix
75
+ self.api_key = api_key
76
+ self.api_response_callback = api_response_callback
77
+ self.max_tokens = max_tokens
78
+ self.only_n_most_recent_images = only_n_most_recent_images
79
+ self.selected_screen = selected_screen
80
+
81
+ self.tool_collection = ToolCollection(
82
+ ComputerTool(selected_screen=selected_screen),
83
+ BashTool(),
84
+ EditTool(),
85
+ )
86
+
87
+ self.system = (
88
+ f"{SYSTEM_PROMPT}{' ' + system_prompt_suffix if system_prompt_suffix else ''}"
89
+ )
90
+
91
+ self.total_token_usage = 0
92
+ self.total_cost = 0
93
+ self.print_usage = print_usage
94
+
95
+ # Instantiate the appropriate API client based on the provider
96
+ if provider == APIProvider.ANTHROPIC:
97
+ self.client = Anthropic(api_key=api_key)
98
+ elif provider == APIProvider.VERTEX:
99
+ self.client = AnthropicVertex()
100
+ elif provider == APIProvider.BEDROCK:
101
+ self.client = AnthropicBedrock()
102
+
103
+ def __call__(
104
+ self,
105
+ *,
106
+ messages: list[BetaMessageParam]
107
+ ):
108
+ """
109
+ Generate a response given history messages.
110
+ """
111
+ if self.only_n_most_recent_images:
112
+ _maybe_filter_to_n_most_recent_images(messages, self.only_n_most_recent_images)
113
+
114
+ # Call the API synchronously
115
+ raw_response = self.client.beta.messages.with_raw_response.create(
116
+ max_tokens=self.max_tokens,
117
+ messages=messages,
118
+ model=self.model,
119
+ system=self.system,
120
+ tools=self.tool_collection.to_params(),
121
+ betas=["computer-use-2024-10-22"],
122
+ )
123
+
124
+ self.api_response_callback(cast(APIResponse[BetaMessage], raw_response))
125
+
126
+ response = raw_response.parse()
127
+ print(f"AnthropicActor response: {response}")
128
+
129
+ self.total_token_usage += response.usage.input_tokens + response.usage.output_tokens
130
+ self.total_cost += (response.usage.input_tokens * 3 / 1000000 + response.usage.output_tokens * 15 / 1000000)
131
+
132
+ if self.print_usage:
133
+ print(f"Claude total token usage so far: {self.total_token_usage}, total cost so far: $USD{self.total_cost}")
134
+
135
+ return response
136
+
137
+
138
+ def _maybe_filter_to_n_most_recent_images(
139
+ messages: list[BetaMessageParam],
140
+ images_to_keep: int,
141
+ min_removal_threshold: int = 10,
142
+ ):
143
+ """
144
+ With the assumption that images are screenshots that are of diminishing value as
145
+ the conversation progresses, remove all but the final `images_to_keep` tool_result
146
+ images in place, with a chunk of min_removal_threshold to reduce the amount we
147
+ break the implicit prompt cache.
148
+ """
149
+ if images_to_keep is None:
150
+ return messages
151
+
152
+ tool_result_blocks = cast(
153
+ list[ToolResultBlockParam],
154
+ [
155
+ item
156
+ for message in messages
157
+ for item in (
158
+ message["content"] if isinstance(message["content"], list) else []
159
+ )
160
+ if isinstance(item, dict) and item.get("type") == "tool_result"
161
+ ],
162
+ )
163
+
164
+ total_images = sum(
165
+ 1
166
+ for tool_result in tool_result_blocks
167
+ for content in tool_result.get("content", [])
168
+ if isinstance(content, dict) and content.get("type") == "image"
169
+ )
170
+
171
+ images_to_remove = total_images - images_to_keep
172
+ # for better cache behavior, we want to remove in chunks
173
+ images_to_remove -= images_to_remove % min_removal_threshold
174
+
175
+ for tool_result in tool_result_blocks:
176
+ if isinstance(tool_result.get("content"), list):
177
+ new_content = []
178
+ for content in tool_result.get("content", []):
179
+ if isinstance(content, dict) and content.get("type") == "image":
180
+ if images_to_remove > 0:
181
+ images_to_remove -= 1
182
+ continue
183
+ new_content.append(content)
184
+ tool_result["content"] = new_content
185
+
186
+
187
+
188
+ if __name__ == "__main__":
189
+ pass
190
+ # client = Anthropic(api_key="")
191
+ # response = client.beta.messages.with_raw_response.create(
192
+ # max_tokens=4096,
193
+ # model="claude-3-5-sonnet-20241022",
194
+ # system=SYSTEM_PROMPT,
195
+ # # tools=ToolCollection(
196
+ # # ComputerTool(selected_screen=0),
197
+ # # BashTool(),
198
+ # # EditTool(),
199
+ # # ).to_params(),
200
+ # betas=["computer-use-2024-10-22"],
201
+ # messages=[
202
+ # {"role": "user", "content": "click on (199, 199)."}
203
+ # ],
204
+ # )
205
+
206
+ # print(f"AnthropicActor response: {response.parse().usage.input_tokens+response.parse().usage.output_tokens}")
computer_use_demo/gui_agent/planner/api_vlm_planner.py ADDED
@@ -0,0 +1,305 @@
1
+ import json
2
+ import asyncio
3
+ import platform
4
+ from collections.abc import Callable
5
+ from datetime import datetime
6
+ from enum import StrEnum
7
+ from typing import Any, cast, Dict, Callable
8
+
9
+ from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse
10
+ from anthropic.types import TextBlock, ToolResultBlockParam
11
+ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, BetaMessageParam
12
+
13
+ from computer_use_demo.tools.screen_capture import get_screenshot
14
+ from computer_use_demo.gui_agent.llm_utils.oai import run_oai_interleaved, run_ssh_llm_interleaved
15
+ from computer_use_demo.gui_agent.llm_utils.qwen import run_qwen
16
+ from computer_use_demo.gui_agent.llm_utils.llm_utils import extract_data, encode_image
17
+ from computer_use_demo.tools.colorful_text import colorful_text_showui, colorful_text_vlm
18
+
19
+ import torch
20
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
21
+ from qwen_vl_utils import process_vision_info
22
+
23
+
24
+ class APIVLMPlanner:
25
+ def __init__(
26
+ self,
27
+ model: str,
28
+ provider: str,
29
+ system_prompt_suffix: str,
30
+ api_key: str,
31
+ output_callback: Callable,
32
+ api_response_callback: Callable,
33
+ max_tokens: int = 4096,
34
+ only_n_most_recent_images: int | None = None,
35
+ selected_screen: int = 0,
36
+ print_usage: bool = True,
37
+ device: torch.device = torch.device("cpu"),
38
+ ):
39
+ self.device = device
40
+ if model == "gpt-4o":
41
+ self.model = "gpt-4o-2024-11-20"
42
+ elif model == "gpt-4o-mini":
43
+ self.model = "gpt-4o-mini" # "gpt-4o-mini"
44
+ elif model == "qwen2-vl-max":
45
+ self.model = "qwen2-vl-max"
46
+ elif model == "qwen2-vl-2b (ssh)":
47
+ self.model = "Qwen2-VL-2B-Instruct"
48
+ elif model == "qwen2-vl-7b (ssh)":
49
+ self.model = "Qwen2-VL-7B-Instruct"
50
+ elif model == "qwen2.5-vl-7b (ssh)":
51
+ self.model = "Qwen2.5-VL-7B-Instruct"
52
+ elif model == "qwen-vl-7b-instruct": # local model
53
+ self.model = "qwen-vl-7b-instruct"
54
+ self.min_pixels = 256 * 28 * 28
55
+ self.max_pixels = 1344 * 28 * 28
56
+ self.processor = AutoProcessor.from_pretrained(
57
+ "./Qwen2-VL-7B-Instruct",
58
+ min_pixels=self.min_pixels,
59
+ max_pixels=self.max_pixels
60
+ )
61
+ else:
62
+ raise ValueError(f"Model {model} not supported")
63
+
64
+ self.provider = provider
65
+ self.system_prompt_suffix = system_prompt_suffix
66
+ self.api_key = api_key
67
+ self.api_response_callback = api_response_callback
68
+ self.max_tokens = max_tokens
69
+ self.only_n_most_recent_images = only_n_most_recent_images
70
+ self.selected_screen = selected_screen
71
+ self.output_callback = output_callback
72
+ self.system_prompt = self._get_system_prompt() + self.system_prompt_suffix
73
+
74
+
75
+ self.print_usage = print_usage
76
+ self.total_token_usage = 0
77
+ self.total_cost = 0
78
+
79
+
80
+ def __call__(self, messages: list):
81
+
82
+ # drop looping actions msg, byte image etc
83
+ planner_messages = _message_filter_callback(messages)
84
+ print(f"filtered_messages: {planner_messages}")
85
+
86
+ if self.only_n_most_recent_images:
87
+ _maybe_filter_to_n_most_recent_images(planner_messages, self.only_n_most_recent_images)
88
+
89
+ # Take a screenshot
90
+ screenshot, screenshot_path = get_screenshot(selected_screen=self.selected_screen)
91
+ screenshot_path = str(screenshot_path)
92
+ image_base64 = encode_image(screenshot_path)
93
+ self.output_callback(f'Screenshot for {colorful_text_vlm}:\n<img src="data:image/png;base64,{image_base64}">',
94
+ sender="bot")
95
+
96
+ if isinstance(planner_messages[-1], dict):
97
+ if not isinstance(planner_messages[-1]["content"], list):
98
+ planner_messages[-1]["content"] = [planner_messages[-1]["content"]]
99
+ planner_messages[-1]["content"].append(screenshot_path)
100
+
101
+ print(f"Sending messages to VLMPlanner: {planner_messages}")
102
+
103
+ if self.model == "gpt-4o-2024-11-20":
104
+ vlm_response, token_usage = run_oai_interleaved(
105
+ messages=planner_messages,
106
+ system=self.system_prompt,
107
+ llm=self.model,
108
+ api_key=self.api_key,
109
+ max_tokens=self.max_tokens,
110
+ temperature=0,
111
+ )
112
+ print(f"oai token usage: {token_usage}")
113
+ self.total_token_usage += token_usage
114
+ self.total_cost += (token_usage * 0.15 / 1000000) # https://openai.com/api/pricing/
115
+
116
+ elif self.model == "qwen2-vl-max":
117
+ vlm_response, token_usage = run_qwen(
118
+ messages=planner_messages,
119
+ system=self.system_prompt,
120
+ llm=self.model,
121
+ api_key=self.api_key,
122
+ max_tokens=self.max_tokens,
123
+ temperature=0,
124
+ )
125
+ print(f"qwen token usage: {token_usage}")
126
+ self.total_token_usage += token_usage
127
+ self.total_cost += (token_usage * 0.02 / 7.25 / 1000) # 1USD=7.25CNY, https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-plus-api
128
+ elif "Qwen" in self.model:
129
+ # Parse the SSH host and port from the api_key field
130
+ try:
131
+ ssh_host, ssh_port = self.api_key.split(":")
132
+ ssh_port = int(ssh_port)
133
+ except ValueError:
134
+ raise ValueError("Invalid SSH connection string. Expected format: host:port")
135
+
136
+ vlm_response, token_usage = run_ssh_llm_interleaved(
137
+ messages=planner_messages,
138
+ system=self.system_prompt,
139
+ llm=self.model,
140
+ ssh_host=ssh_host,
141
+ ssh_port=ssh_port,
142
+ max_tokens=self.max_tokens,
143
+ )
144
+ else:
145
+ raise ValueError(f"Model {self.model} not supported")
146
+
147
+ print(f"VLMPlanner response: {vlm_response}")
148
+
149
+ if self.print_usage:
150
+ print(f"VLMPlanner total token usage so far: {self.total_token_usage}. Total cost so far: $USD{self.total_cost:.5f}")
151
+
152
+ vlm_response_json = extract_data(vlm_response, "json")
153
+
154
+ # vlm_plan_str = '\n'.join([f'{key}: {value}' for key, value in json.loads(response).items()])
155
+ vlm_plan_str = ""
156
+ for key, value in json.loads(vlm_response_json).items():
157
+ if key == "Thinking":
158
+ vlm_plan_str += f'{value}'
159
+ else:
160
+ vlm_plan_str += f'\n{key}: {value}'
161
+
162
+ self.output_callback(f"{colorful_text_vlm}:\n{vlm_plan_str}", sender="bot")
163
+
164
+ return vlm_response_json
165
+
166
+
167
+ def _api_response_callback(self, response: APIResponse):
168
+ self.api_response_callback(response)
169
+
170
+
171
+ def reformat_messages(self, messages: list):
172
+ pass
173
+
174
+ def _get_system_prompt(self):
175
+ os_name = platform.system()
176
+ return f"""
177
+ You are using a {os_name} device.
178
+ You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot.
179
+ You can only interact with the desktop GUI (no terminal or application menu access).
180
+
181
+ You may be given some history of plans and actions; this is the response from the previous loop.
182
+ You should carefully consider your plan based on the task, the screenshot, and the history of actions.
183
+
184
+ Your available "Next Action" only include:
185
+ - ENTER: Press an enter key.
186
+ - ESCAPE: Press an ESCAPE key.
187
+ - INPUT: Input a string of text.
188
+ - CLICK: Describe the ui element to be clicked.
189
+ - HOVER: Describe the ui element to be hovered.
190
+ - SCROLL: Scroll the screen, you must specify up or down.
191
+ - PRESS: Describe the ui element to be pressed.
192
+
193
+
194
+ Output format:
195
+ ```json
196
+ {{
197
+ "Thinking": str, # describe your thoughts on how to achieve the task, choose one action from available actions at a time.
198
+ "Next Action": "action_type, action description" | "None" # one action at a time, described briefly and precisely.
199
+ }}
200
+ ```
201
+
202
+ One Example:
203
+ ```json
204
+ {{
205
+ "Thinking": "I need to search and navigate to amazon.com.",
206
+ "Next Action": "CLICK 'Search Google or type a URL'."
207
+ }}
208
+ ```
209
+
210
+ IMPORTANT NOTES:
211
+ 1. Carefully observe the screenshot to understand the current state and read history actions.
212
+ 2. You should only give a single action at a time. For example, INPUT text and ENTER cannot be combined into one Next Action.
213
+ 3. Attach the text to Next Action, if there is text or any description for the button.
214
+ 4. You should not include other actions, such as keyboard shortcuts.
215
+ 5. When the task is completed, you should say "Next Action": "None" in the json field.
216
+ """
217
+
218
+
219
+
220
+ def _maybe_filter_to_n_most_recent_images(
221
+ messages: list[BetaMessageParam],
222
+ images_to_keep: int,
223
+ min_removal_threshold: int = 10,
224
+ ):
225
+ """
226
+ With the assumption that images are screenshots that are of diminishing value as
227
+ the conversation progresses, remove all but the final `images_to_keep` tool_result
228
+ images in place, with a chunk of min_removal_threshold to reduce the amount we
229
+ break the implicit prompt cache.
230
+ """
231
+ if images_to_keep is None:
232
+ return messages
233
+
234
+ tool_result_blocks = cast(
235
+ list[ToolResultBlockParam],
236
+ [
237
+ item
238
+ for message in messages
239
+ for item in (
240
+ message["content"] if isinstance(message["content"], list) else []
241
+ )
242
+ if isinstance(item, dict) and item.get("type") == "tool_result"
243
+ ],
244
+ )
245
+
246
+ total_images = sum(
247
+ 1
248
+ for tool_result in tool_result_blocks
249
+ for content in tool_result.get("content", [])
250
+ if isinstance(content, dict) and content.get("type") == "image"
251
+ )
252
+
253
+ images_to_remove = total_images - images_to_keep
254
+ # for better cache behavior, we want to remove in chunks
255
+ images_to_remove -= images_to_remove % min_removal_threshold
256
+
257
+ for tool_result in tool_result_blocks:
258
+ if isinstance(tool_result.get("content"), list):
259
+ new_content = []
260
+ for content in tool_result.get("content", []):
261
+ if isinstance(content, dict) and content.get("type") == "image":
262
+ if images_to_remove > 0:
263
+ images_to_remove -= 1
264
+ continue
265
+ new_content.append(content)
266
+ tool_result["content"] = new_content
267
+
268
+
269
+ def _message_filter_callback(messages):
270
+ filtered_list = []
271
+ try:
272
+ for msg in messages:
273
+ if msg.get('role') in ['user']:
274
+ if not isinstance(msg["content"], list):
275
+ msg["content"] = [msg["content"]]
276
+ if isinstance(msg["content"][0], TextBlock):
277
+ filtered_list.append(str(msg["content"][0].text)) # User message
278
+ elif isinstance(msg["content"][0], str):
279
+ filtered_list.append(msg["content"][0]) # User message
280
+ else:
281
+ print("[_message_filter_callback]: drop message", msg)
282
+ continue
283
+
284
+ # elif msg.get('role') in ['assistant']:
285
+ # if isinstance(msg["content"][0], TextBlock):
286
+ # msg["content"][0] = str(msg["content"][0].text)
287
+ # elif isinstance(msg["content"][0], BetaTextBlock):
288
+ # msg["content"][0] = str(msg["content"][0].text)
289
+ # elif isinstance(msg["content"][0], BetaToolUseBlock):
290
+ # msg["content"][0] = str(msg['content'][0].input)
291
+ # elif isinstance(msg["content"][0], Dict) and msg["content"][0]["content"][-1]["type"] == "image":
292
+ # msg["content"][0] = f'<img src="data:image/png;base64,{msg["content"][0]["content"][-1]["source"]["data"]}">'
293
+ # else:
294
+ # print("[_message_filter_callback]: drop message", msg)
295
+ # continue
296
+ # filtered_list.append(msg["content"][0]) # User message
297
+
298
+ else:
299
+ print("[_message_filter_callback]: drop message", msg)
300
+ continue
301
+
302
+ except Exception as e:
303
+ print("[_message_filter_callback]: error", e)
304
+
305
+ return filtered_list
computer_use_demo/gui_agent/planner/local_vlm_planner.py ADDED
@@ -0,0 +1,235 @@
1
+ import json
2
+ import asyncio
3
+ import platform
4
+ from collections.abc import Callable
5
+ from datetime import datetime
6
+ from enum import StrEnum
7
+ from typing import Any, cast, Dict, Callable
8
+
9
+ from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse
10
+ from anthropic.types import TextBlock, ToolResultBlockParam
11
+ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, BetaMessageParam
12
+
13
+ from computer_use_demo.tools.screen_capture import get_screenshot
14
+ from computer_use_demo.gui_agent.llm_utils.llm_utils import extract_data, encode_image
15
+ from computer_use_demo.tools.colorful_text import colorful_text_showui, colorful_text_vlm
16
+
17
+ import torch
18
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
19
+ from qwen_vl_utils import process_vision_info
20
+
21
+ SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
22
+ * You are utilizing a Windows system with internet access.
23
+ * The current date is {datetime.today().strftime('%A, %B %d, %Y')}.
24
+ </SYSTEM_CAPABILITY>
25
+ """
26
+
27
+ class LocalVLMPlanner:
28
+ def __init__(
29
+ self,
30
+ model: str,
31
+ provider: str,
32
+ system_prompt_suffix: str,
33
+ output_callback: Callable,
34
+ api_response_callback: Callable,
35
+ max_tokens: int = 4096,
36
+ only_n_most_recent_images: int | None = None,
37
+ selected_screen: int = 0,
38
+ print_usage: bool = True,
39
+ device: torch.device = torch.device("cpu"),
40
+ ):
41
+ self.device = device
42
+ self.min_pixels = 256 * 28 * 28
43
+ self.max_pixels = 1344 * 28 * 28
44
+
45
+ if model == "qwen-vl-7b-instruct": # local model
46
+ self.model_name = "qwen-vl-7b-instruct"
47
+ self.model = Qwen2VLForConditionalGeneration.from_pretrained(
48
+ # "Qwen/Qwen2-VL-7B-Instruct",
49
+ "./Qwen2-VL-7B-Instruct",
50
+ torch_dtype=torch.bfloat16,
51
+ device_map="cpu"
52
+ ).to(self.device)
53
+ self.processor = AutoProcessor.from_pretrained(
54
+ "./Qwen2-VL-7B-Instruct",
55
+ min_pixels=self.min_pixels,
56
+ max_pixels=self.max_pixels
57
+ )
58
+
59
+ elif model == "qwen2-vl-2b-instruct":
60
+ self.model_name = "qwen2-vl-2b-instruct"
61
+ self.model = Qwen2VLForConditionalGeneration.from_pretrained(
62
+ # "Qwen/Qwen2-VL-2B-Instruct",
63
+ "./Qwen2-VL-2B-Instruct",
64
+ torch_dtype=torch.bfloat16,
65
+ device_map="cpu"
66
+ ).to(self.device)
67
+ self.processor = AutoProcessor.from_pretrained(
68
+ "./Qwen2-VL-2B-Instruct",
69
+ min_pixels=self.min_pixels,
70
+ max_pixels=self.max_pixels
71
+ )
72
+ else:
73
+ raise ValueError(f"Model {model} not supported")
74
+
75
+ self.provider = provider
76
+ self.system_prompt_suffix = system_prompt_suffix
77
+ self.api_response_callback = api_response_callback
78
+ self.max_tokens = max_tokens
79
+ self.only_n_most_recent_images = only_n_most_recent_images
80
+ self.selected_screen = selected_screen
81
+ self.output_callback = output_callback
82
+ self.system_prompt = self._get_system_prompt() + self.system_prompt_suffix
83
+
84
+ self.print_usage = print_usage
85
+ self.total_token_usage = 0
86
+ self.total_cost = 0
87
+
88
+
89
+ def __call__(self, messages: list):
90
+
91
+ # drop looping actions msg, byte image etc
92
+ planner_messages = _message_filter_callback(messages)
93
+ print(f"filtered_messages: {planner_messages}")
94
+
95
+ # Take a screenshot
96
+ screenshot, screenshot_path = get_screenshot(selected_screen=self.selected_screen)
97
+ screenshot_path = str(screenshot_path)
98
+ image_base64 = encode_image(screenshot_path)
99
+ self.output_callback(f'Screenshot for {colorful_text_vlm}:\n<img src="data:image/png;base64,{image_base64}">',
100
+ sender="bot")
101
+
102
+ if isinstance(planner_messages[-1], dict):
103
+ if not isinstance(planner_messages[-1]["content"], list):
104
+ planner_messages[-1]["content"] = [planner_messages[-1]["content"]]
105
+ planner_messages[-1]["content"].append(screenshot_path)
106
+
107
+ print(f"Sending messages to VLMPlanner: {planner_messages}")
108
+
109
+ messages_for_processor = [
110
+ {
111
+ "role": "system",
112
+ "content": [{"type": "text", "text": self.system_prompt}]
113
+ },
114
+ {
115
+ "role": "user",
116
+ "content": [
117
+ {"type": "image", "image": screenshot_path, "min_pixels": self.min_pixels, "max_pixels": self.max_pixels},
118
+ {"type": "text", "text": f"Task: {''.join(planner_messages)}"}
119
+ ],
120
+ }]
121
+
122
+ text = self.processor.apply_chat_template(
123
+ messages_for_processor, tokenize=False, add_generation_prompt=True
124
+ )
125
+ image_inputs, video_inputs = process_vision_info(messages_for_processor)
126
+
127
+ inputs = self.processor(
128
+ text=[text],
129
+ images=image_inputs,
130
+ videos=video_inputs,
131
+ padding=True,
132
+ return_tensors="pt",
133
+ )
134
+ inputs = inputs.to(self.device)
135
+
136
+ generated_ids = self.model.generate(**inputs, max_new_tokens=128)
137
+ generated_ids_trimmed = [
138
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
139
+ ]
140
+ vlm_response = self.processor.batch_decode(
141
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
142
+ )[0]
143
+
144
+ print(f"VLMPlanner response: {vlm_response}")
145
+
146
+ vlm_response_json = extract_data(vlm_response, "json")
147
+
148
+ # vlm_plan_str = '\n'.join([f'{key}: {value}' for key, value in json.loads(response).items()])
149
+ vlm_plan_str = ""
150
+ for key, value in json.loads(vlm_response_json).items():
151
+ if key == "Thinking":
152
+ vlm_plan_str += f'{value}'
153
+ else:
154
+ vlm_plan_str += f'\n{key}: {value}'
155
+
156
+ self.output_callback(f"{colorful_text_vlm}:\n{vlm_plan_str}", sender="bot")
157
+
158
+ return vlm_response_json
159
+
160
+
161
+ def _api_response_callback(self, response: APIResponse):
162
+ self.api_response_callback(response)
163
+
164
+
165
+ def reformat_messages(self, messages: list):
166
+ pass
167
+
168
+ def _get_system_prompt(self):
169
+ os_name = platform.system()
170
+ return f"""
171
+ You are using a {os_name} device.
172
+ You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot.
173
+ You can only interact with the desktop GUI (no terminal or application menu access).
174
+
175
+ You may be given some history of plans and actions; this is the response from the previous loop.
176
+ You should carefully consider your plan based on the task, the screenshot, and the history of actions.
177
+
178
+ Your available "Next Action" only include:
179
+ - ENTER: Press an enter key.
180
+ - ESCAPE: Press an ESCAPE key.
181
+ - INPUT: Input a string of text.
182
+ - CLICK: Describe the ui element to be clicked.
183
+ - HOVER: Describe the ui element to be hovered.
184
+ - SCROLL: Scroll the screen, you must specify up or down.
185
+ - PRESS: Describe the ui element to be pressed.
186
+
187
+ Output format:
188
+ ```json
189
+ {{
190
+ "Thinking": str, # describe your thoughts on how to achieve the task, choose one action from available actions at a time.
191
+ "Next Action": "action_type, action description" | "None" # one action at a time, described briefly and precisely.
192
+ }}
193
+ ```
194
+
195
+ One Example:
196
+ ```json
197
+ {{
198
+ "Thinking": "I need to search and navigate to amazon.com.",
199
+ "Next Action": "CLICK 'Search Google or type a URL'."
200
+ }}
201
+ ```
202
+
203
+ IMPORTANT NOTES:
204
+ 1. Carefully observe the screenshot to understand the current state and read history actions.
205
+ 2. You should only give a single action at a time. For example, INPUT text and ENTER cannot be combined into one Next Action.
206
+ 3. Attach the text to Next Action, if there is text or any description for the button.
207
+ 4. You should not include other actions, such as keyboard shortcuts.
208
+ 5. When the task is completed, you should say "Next Action": "None" in the json field.
209
+ """
210
+
211
+
212
+
213
+ def _message_filter_callback(messages):
214
+ filtered_list = []
215
+ try:
216
+ for msg in messages:
217
+ if msg.get('role') in ['user']:
218
+ if not isinstance(msg["content"], list):
219
+ msg["content"] = [msg["content"]]
220
+ if isinstance(msg["content"][0], TextBlock):
221
+ filtered_list.append(str(msg["content"][0].text)) # User message
222
+ elif isinstance(msg["content"][0], str):
223
+ filtered_list.append(msg["content"][0]) # User message
224
+ else:
225
+ print("[_message_filter_callback]: drop message", msg)
226
+ continue
227
+
228
+ else:
229
+ print("[_message_filter_callback]: drop message", msg)
230
+ continue
231
+
232
+ except Exception as e:
233
+ print("[_message_filter_callback]: error", e)
234
+
235
+ return filtered_list
computer_use_demo/loop.py ADDED
@@ -0,0 +1,276 @@
1
+ """
2
+ Agentic sampling loop that calls the Anthropic API and local implementation of computer use tools.
3
+ """
4
+ import time
5
+ import json
6
+ from collections.abc import Callable
7
+ from enum import StrEnum
8
+
9
+ from anthropic import APIResponse
10
+ from anthropic.types.beta import BetaContentBlock, BetaMessage, BetaMessageParam
11
+ from computer_use_demo.tools import ToolResult
12
+
13
+
14
+ import torch
15
+
16
+ from computer_use_demo.gui_agent.planner.anthropic_agent import AnthropicActor
17
+ from computer_use_demo.executor.anthropic_executor import AnthropicExecutor
18
+ from computer_use_demo.gui_agent.planner.api_vlm_planner import APIVLMPlanner
19
+ from computer_use_demo.gui_agent.planner.local_vlm_planner import LocalVLMPlanner
20
+ from computer_use_demo.gui_agent.actor.showui_agent import ShowUIActor
21
+ from computer_use_demo.executor.showui_executor import ShowUIExecutor
22
+ from computer_use_demo.gui_agent.actor.uitars_agent import UITARS_Actor
23
+ from computer_use_demo.tools.colorful_text import colorful_text_showui, colorful_text_vlm
24
+ from computer_use_demo.tools.screen_capture import get_screenshot
25
+ from computer_use_demo.gui_agent.llm_utils.oai import encode_image
26
+
27
+ from computer_use_demo.tools.logger import logger
28
+
29
+
30
+
31
+ class APIProvider(StrEnum):
32
+ ANTHROPIC = "anthropic"
33
+ BEDROCK = "bedrock"
34
+ VERTEX = "vertex"
35
+ OPENAI = "openai"
36
+ QWEN = "qwen"
37
+ SSH = "ssh"
38
+
39
+
40
+ PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
41
+ APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
42
+ APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
43
+ APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
44
+ APIProvider.OPENAI: "gpt-4o",
45
+ APIProvider.QWEN: "qwen2vl",
46
+ APIProvider.SSH: "qwen2-vl-2b",
47
+ }
48
+
49
+
50
+ def sampling_loop_sync(
51
+ *,
52
+ planner_model: str,
53
+ planner_provider: APIProvider | None,
54
+ actor_model: str,
55
+ actor_provider: APIProvider | None,
56
+ system_prompt_suffix: str,
57
+ messages: list[BetaMessageParam],
58
+ output_callback: Callable[[BetaContentBlock], None],
59
+ tool_output_callback: Callable[[ToolResult, str], None],
60
+ api_response_callback: Callable[[APIResponse[BetaMessage]], None],
61
+ api_key: str,
62
+ only_n_most_recent_images: int | None = None,
63
+ max_tokens: int = 4096,
64
+ selected_screen: int = 0,
65
+ showui_max_pixels: int = 1344,
66
+ showui_awq_4bit: bool = False,
67
+ ui_tars_url: str = ""
68
+ ):
69
+ """
70
+ Synchronous agentic sampling loop for the assistant/tool interaction of computer use.
71
+ """
72
+
73
+ # ---------------------------
74
+ # Initialize Planner
75
+ # ---------------------------
76
+ if planner_model == "claude-3-5-sonnet-20241022":
77
+ # Register Actor and Executor
78
+ actor = AnthropicActor(
79
+ model=planner_model,
80
+ provider=actor_provider,
81
+ system_prompt_suffix=system_prompt_suffix,
82
+ api_key=api_key,
83
+ api_response_callback=api_response_callback,
84
+ max_tokens=max_tokens,
85
+ only_n_most_recent_images=only_n_most_recent_images,
86
+ selected_screen=selected_screen
87
+ )
88
+
89
+ executor = AnthropicExecutor(
90
+ output_callback=output_callback,
91
+ tool_output_callback=tool_output_callback,
92
+ selected_screen=selected_screen
93
+ )
94
+
95
+ loop_mode = "unified"
96
+
97
+ elif planner_model in ["gpt-4o", "gpt-4o-mini", "qwen2-vl-max"]:
98
+
99
+ if torch.cuda.is_available(): device = torch.device("cuda")
100
+ elif torch.backends.mps.is_available(): device = torch.device("mps")
101
+ else: device = torch.device("cpu") # support: 'cpu', 'mps', 'cuda'
102
+ logger.info(f"Model inited on device: {device}.")
103
+
104
+ planner = APIVLMPlanner(
105
+ model=planner_model,
106
+ provider=planner_provider,
107
+ system_prompt_suffix=system_prompt_suffix,
108
+ api_key=api_key,
109
+ api_response_callback=api_response_callback,
110
+ selected_screen=selected_screen,
111
+ output_callback=output_callback,
112
+ device=device
113
+ )
114
+ loop_mode = "planner + actor"
115
+
116
+ elif planner_model == "qwen2-vl-7b-instruct":
+ # select the device here as in the other planner branches; it is referenced below
+ if torch.cuda.is_available(): device = torch.device("cuda")
+ elif torch.backends.mps.is_available(): device = torch.device("mps")
+ else: device = torch.device("cpu") # support: 'cpu', 'mps', 'cuda'
+ logger.info(f"Model inited on device: {device}.")
117
+ planner = LocalVLMPlanner(
118
+ model=planner_model,
119
+ provider=planner_provider,
120
+ system_prompt_suffix=system_prompt_suffix,
121
122
+ api_response_callback=api_response_callback,
123
+ selected_screen=selected_screen,
124
+ output_callback=output_callback,
125
+ device=device
126
+ )
127
+ loop_mode = "planner + actor"
128
+ elif "ssh" in planner_model:
129
+ if torch.cuda.is_available(): device = torch.device("cuda")
130
+ elif torch.backends.mps.is_available(): device = torch.device("mps")
131
+ else: device = torch.device("cpu") # support: 'cpu', 'mps', 'cuda'
132
+ logger.info(f"Model inited on device: {device}.")
133
+ planner = APIVLMPlanner(
134
+ model=planner_model,
135
+ provider=planner_provider,
136
+ system_prompt_suffix=system_prompt_suffix,
137
+ api_key=api_key,
138
+ api_response_callback=api_response_callback,
139
+ selected_screen=selected_screen,
140
+ output_callback=output_callback,
141
+ device=device
142
+ )
143
+ loop_mode = "planner + actor"
144
+ else:
145
+ logger.error(f"Planner Model {planner_model} not supported")
146
+ raise ValueError(f"Planner Model {planner_model} not supported")
147
+
148
+
149
+ # ---------------------------
150
+ # Initialize Actor
151
+ # ---------------------------
152
+ if actor_model == "ShowUI":
153
+ if showui_awq_4bit:
154
+ showui_model_path = "./showui-2b-awq-4bit/"
155
+ else:
156
+ showui_model_path = "./showui-2b/"
157
+
158
+ actor = ShowUIActor(
159
+ model_path=showui_model_path,
160
+ device=device,
161
+ split='desktop', # 'desktop' or 'phone'
162
+ selected_screen=selected_screen,
163
+ output_callback=output_callback,
164
+ max_pixels=showui_max_pixels,
165
+ awq_4bit=showui_awq_4bit
166
+ )
167
+
168
+ executor = ShowUIExecutor(
169
+ output_callback=output_callback,
170
+ tool_output_callback=tool_output_callback,
171
+ selected_screen=selected_screen
172
+ )
173
+ elif actor_model == "UI-TARS":
174
+ actor = UITARS_Actor(
175
+ ui_tars_url=ui_tars_url,
176
+ output_callback=output_callback,
177
+ selected_screen=selected_screen
178
+ )
179
+
180
+ else:
181
+ raise ValueError(f"Actor Model {actor_model} not supported")
182
+
183
+
184
+ tool_result_content = None
185
+ showui_loop_count = 0
186
+
187
+ logger.info(f"Start the message loop. User messages: {messages}")
188
+
189
+ if loop_mode == "unified":
190
+ # ------------------------------
191
+ # Unified loop: repeatedly call actor -> executor -> check tool_result -> maybe end
192
+ # ------------------------------
193
+ while True:
194
+ # Call the actor with current messages
195
+ response = actor(messages=messages)
196
+
197
+ # Let the executor process that response, yielding any intermediate messages
198
+ for message, tool_result_content in executor(response, messages):
199
+ yield message
200
+
201
+ # If executor didn't produce further content, we're done
202
+ if not tool_result_content:
203
+ return messages
204
+
205
+ # If there is more tool content, treat that as user input
206
+ messages.append({
207
+ "content": tool_result_content,
208
+ "role": "user"
209
+ })
210
+
211
+ elif loop_mode == "planner + actor":
212
+ # ------------------------------------------------------
213
+ # Planner + actor loop:
214
+ # 1) planner => get next_action
215
+ # 2) If no next_action -> end
216
+ # 3) Otherwise actor => executor
217
+ # 4) repeat
218
+ # ------------------------------------------------------
219
+ while True:
220
+ # Step 1: Planner (VLM) response
221
+ vlm_response = planner(messages=messages)
222
+
223
+ # Step 2: Extract the "Next Action" from the planner output
224
+ next_action = json.loads(vlm_response).get("Next Action")
225
+
226
+ # Yield the next_action string, in case the UI or logs want to show it
227
+ yield next_action
228
+
229
+ # Step 3: Check if there are no further actions
230
+ if not next_action or next_action in ("None", ""):
231
+ final_sc, final_sc_path = get_screenshot(selected_screen=selected_screen)
232
+ final_image_b64 = encode_image(str(final_sc_path))
233
+
234
+ output_callback(
235
+ (
236
+ f"No more actions from {colorful_text_vlm}. End of task. Final State:\n"
237
+ f'<img src="data:image/png;base64,{final_image_b64}">'
238
+ ),
239
+ sender="bot"
240
+ )
241
+ yield None
242
+ break
243
+
244
+ # Step 4: Output an action message
245
+ output_callback(
246
+ f"{colorful_text_vlm} sending action to {colorful_text_showui}:\n{next_action}",
247
+ sender="bot"
248
+ )
249
+
250
+ # Step 5: Actor response
251
+ actor_response = actor(messages=next_action)
252
+ yield actor_response
253
+
254
+ # Step 6: Execute the actor response
255
+ for message, tool_result_content in executor(actor_response, messages):
256
+ time.sleep(0.5) # optional small delay
257
+ yield message
258
+
259
+ # Step 7: Update conversation with embedding history of plan and actions
260
+ messages.append({
261
+ "role": "user",
262
+ "content": [
263
+ "History plan:" + str(json.loads(vlm_response)),
264
+ "History actions:" + str(actor_response["content"])
265
+ ]
266
+ })
267
+
268
+ logger.info(
269
+ f"End of loop {showui_loop_count + 1}. "
270
+ f"Messages: {str(messages)[:100000]}. "
271
+ f"Total cost: $USD{planner.total_cost:.5f}"
272
+ )
273
+
274
+
275
+ # Increment loop counter
276
+ showui_loop_count += 1
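
For context, a hedged sketch of how a caller might drive this generator; every value below is a placeholder, and the ShowUI branch assumes the ./showui-2b/ weights are installed locally:

from computer_use_demo.loop import sampling_loop_sync

for event in sampling_loop_sync(
    planner_model="gpt-4o",
    planner_provider="openai",
    actor_model="ShowUI",
    actor_provider=None,
    system_prompt_suffix="",
    messages=[{"role": "user", "content": "Open the browser."}],
    output_callback=lambda text, sender=None: print(sender, text),
    tool_output_callback=lambda result, tool_id: print(tool_id, result),
    api_response_callback=lambda response: None,
    api_key="YOUR_OPENAI_API_KEY",  # placeholder
):
    pass  # each yielded item is a plan, action, or message emitted during the loop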
computer_use_demo/remote_inference.py ADDED
@@ -0,0 +1,453 @@
1
+ from contextlib import asynccontextmanager
2
+ from fastapi import FastAPI, HTTPException
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from pydantic import BaseModel, field_validator
5
+ from typing import Optional, List, Union, Dict, Any
6
+ import torch
7
+ from transformers import (
8
+ Qwen2_5_VLForConditionalGeneration,
9
+ Qwen2VLForConditionalGeneration,
10
+ AutoProcessor,
11
+ BitsAndBytesConfig
12
+ )
13
+ from qwen_vl_utils import process_vision_info
14
+ import uvicorn
15
+ import json
16
+ from datetime import datetime
17
+ import logging
18
+ import time
19
+ import psutil
20
+ import GPUtil
21
+ import base64
22
+ from PIL import Image
23
+ import io
24
+ import os
25
+ import threading
26
+
27
+ # Set environment variables to disable compilation cache and avoid CUDA kernel issues
28
+ os.environ["CUDA_LAUNCH_BLOCKING"] = "0"
29
+ os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0" # Compatible with A5000
30
+
31
+ # Model configuration
32
+ MODELS = {
33
+ "Qwen2.5-VL-7B-Instruct": {
34
+ "path": "Qwen/Qwen2.5-VL-7B-Instruct",
35
+ "model_class": Qwen2_5_VLForConditionalGeneration,
36
+ },
37
+ "Qwen2-VL-7B-Instruct": {
38
+ "path": "Qwen/Qwen2-VL-7B-Instruct",
39
+ "model_class": Qwen2VLForConditionalGeneration,
40
+ },
41
+ "Qwen2-VL-2B-Instruct": {
42
+ "path": "Qwen/Qwen2-VL-2B-Instruct",
43
+ "model_class": Qwen2VLForConditionalGeneration,
44
+ }
45
+ }
46
+
47
+ # Configure logging
48
+ logging.basicConfig(
49
+ level=logging.INFO,
50
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
51
+ )
52
+ logger = logging.getLogger(__name__)
53
+
54
+ # Global variables
55
+ models = {}
56
+ processors = {}
57
+ model_locks = {} # Thread locks for model loading
58
+ last_used = {} # Record last use time of models
59
+
60
+ # Set default CUDA device
61
+ if torch.cuda.is_available():
62
+ # Get GPU information and select the device with maximum memory
63
+ gpus = GPUtil.getGPUs()
64
+ if gpus:
65
+ max_memory_gpu = max(gpus, key=lambda g: g.memoryTotal)
66
+ selected_device = max_memory_gpu.id
67
+ torch.cuda.set_device(selected_device)
68
+ device = torch.device(f"cuda:{selected_device}")
69
+ logger.info(f"Selected GPU {selected_device} ({max_memory_gpu.name}) with {max_memory_gpu.memoryTotal}MB memory")
70
+ else:
71
+ device = torch.device("cuda:0")
72
+ else:
73
+ device = torch.device("cpu")
74
+ logger.info(f"Using device: {device}")
75
+
76
+ class ImageURL(BaseModel):
77
+ url: str
78
+
79
+ class MessageContent(BaseModel):
80
+ type: str
81
+ text: Optional[str] = None
82
+ image_url: Optional[Dict[str, str]] = None
83
+
84
+ @field_validator('type')
85
+ @classmethod
86
+ def validate_type(cls, v: str) -> str:
87
+ if v not in ['text', 'image_url']:
88
+ raise ValueError(f"Invalid content type: {v}")
89
+ return v
90
+
91
+ class ChatMessage(BaseModel):
92
+ role: str
93
+ content: Union[str, List[MessageContent]]
94
+
95
+ @field_validator('role')
96
+ @classmethod
97
+ def validate_role(cls, v: str) -> str:
98
+ if v not in ['system', 'user', 'assistant']:
99
+ raise ValueError(f"Invalid role: {v}")
100
+ return v
101
+
102
+ @field_validator('content')
103
+ @classmethod
104
+ def validate_content(cls, v: Union[str, List[Any]]) -> Union[str, List[MessageContent]]:
105
+ if isinstance(v, str):
106
+ return v
107
+ if isinstance(v, list):
108
+ return [MessageContent(**item) if isinstance(item, dict) else item for item in v]
109
+ raise ValueError("Content must be either a string or a list of content items")
110
+
111
+ class ChatCompletionRequest(BaseModel):
112
+ model: str
113
+ messages: List[ChatMessage]
114
+ temperature: Optional[float] = 0.7
115
+ top_p: Optional[float] = 0.95
116
+ max_tokens: Optional[int] = 2048
117
+ stream: Optional[bool] = False
118
+ response_format: Optional[Dict[str, str]] = None
119
+
120
+ class ChatCompletionResponse(BaseModel):
121
+ id: str
122
+ object: str
123
+ created: int
124
+ model: str
125
+ choices: List[Dict[str, Any]]
126
+ usage: Dict[str, int]
127
+
128
+ class ModelCard(BaseModel):
129
+ id: str
130
+ created: int
131
+ owned_by: str
132
+ permission: List[Dict[str, Any]] = []
133
+ root: Optional[str] = None
134
+ parent: Optional[str] = None
135
+ capabilities: Optional[Dict[str, bool]] = None
136
+ context_window: Optional[int] = None
137
+ max_tokens: Optional[int] = None
138
+
139
+ class ModelList(BaseModel):
140
+ object: str = "list"
141
+ data: List[ModelCard]
142
+
143
+ def process_base64_image(base64_string: str) -> Image.Image:
144
+ """Process base64 image data and return PIL Image"""
145
+ try:
146
+ # Remove data URL prefix if present
147
+ if 'base64,' in base64_string:
148
+ base64_string = base64_string.split('base64,')[1]
149
+
150
+ image_data = base64.b64decode(base64_string)
151
+ image = Image.open(io.BytesIO(image_data))
152
+
153
+ # Convert to RGB if necessary
154
+ if image.mode not in ('RGB', 'L'):
155
+ image = image.convert('RGB')
156
+
157
+ return image
158
+ except Exception as e:
159
+ logger.error(f"Error processing base64 image: {str(e)}")
160
+ raise ValueError(f"Invalid base64 image data: {str(e)}")
161
+
162
+ def log_system_info():
163
+ """Log system resource information"""
164
+ try:
165
+ cpu_percent = psutil.cpu_percent(interval=1)
166
+ memory = psutil.virtual_memory()
167
+ gpu_info = []
168
+ if torch.cuda.is_available():
169
+ for gpu in GPUtil.getGPUs():
170
+ gpu_info.append({
171
+ 'id': gpu.id,
172
+ 'name': gpu.name,
173
+ 'load': f"{gpu.load*100}%",
174
+ 'memory_used': f"{gpu.memoryUsed}MB/{gpu.memoryTotal}MB",
175
+ 'temperature': f"{gpu.temperature}°C"
176
+ })
177
+ logger.info(f"System Info - CPU: {cpu_percent}%, RAM: {memory.percent}%, "
178
+ f"Available RAM: {memory.available/1024/1024/1024:.1f}GB")
179
+ if gpu_info:
180
+ logger.info(f"GPU Info: {gpu_info}")
181
+ except Exception as e:
182
+ logger.warning(f"Failed to log system info: {str(e)}")
183
+
184
+ def get_or_initialize_model(model_name: str):
185
+ """Get or initialize a model if not already loaded"""
186
+ global models, processors, model_locks, last_used
187
+
188
+ if model_name not in MODELS:
189
+ available_models = list(MODELS.keys())
190
+ raise ValueError(f"Unsupported model: {model_name}\nAvailable models: {available_models}")
191
+
192
+ # Initialize lock for the model (if not already done)
193
+ if model_name not in model_locks:
194
+ model_locks[model_name] = threading.Lock()
195
+
196
+ with model_locks[model_name]:
197
+ if model_name not in models or model_name not in processors:
198
+ try:
199
+ start_time = time.time()
200
+ logger.info(f"Starting {model_name} initialization...")
201
+ log_system_info()
202
+
203
+ model_config = MODELS[model_name]
204
+
205
+ # Configure 8-bit quantization (the bnb_4bit_* options below only apply to 4-bit loading)
206
+ quantization_config = BitsAndBytesConfig(
207
+ load_in_8bit=True,
208
+ bnb_4bit_compute_dtype=torch.float16,
209
+ bnb_4bit_use_double_quant=False,
210
+ bnb_4bit_quant_type="nf4",
211
+ )
212
+
213
+ logger.info(f"Loading {model_name} with 8-bit quantization...")
214
+ model = model_config["model_class"].from_pretrained(
215
+ model_config["path"],
216
+ quantization_config=quantization_config,
217
+ device_map={"": device.index if device.type == "cuda" else "cpu"},
218
+ local_files_only=False
219
+ ).eval()
220
+
221
+ processor = AutoProcessor.from_pretrained(
222
+ model_config["path"],
223
+ local_files_only=False
224
+ )
225
+
226
+ models[model_name] = model
227
+ processors[model_name] = processor
228
+
229
+ end_time = time.time()
230
+ logger.info(f"Model {model_name} initialized in {end_time - start_time:.2f} seconds")
231
+ log_system_info()
232
+
233
+ except Exception as e:
234
+ logger.error(f"Model initialization error for {model_name}: {str(e)}", exc_info=True)
235
+ raise RuntimeError(f"Failed to initialize model {model_name}: {str(e)}")
236
+
237
+ # Update last use time
238
+ last_used[model_name] = time.time()
239
+
240
+ return models[model_name], processors[model_name]
241
+
242
+ @asynccontextmanager
243
+ async def lifespan(app: FastAPI):
244
+ logger.info("Starting application initialization...")
245
+ try:
246
+ yield
247
+ finally:
248
+ logger.info("Shutting down application...")
249
+ global models, processors
250
+ for model_name, model in models.items():
251
+ try:
252
+ del model
253
+ logger.info(f"Model {model_name} unloaded")
254
+ except Exception as e:
255
+ logger.error(f"Error during cleanup of {model_name}: {str(e)}")
256
+
257
+ if torch.cuda.is_available():
258
+ torch.cuda.empty_cache()
259
+ logger.info("CUDA cache cleared")
260
+
261
+ models = {}
262
+ processors = {}
263
+ logger.info("Shutdown complete")
264
+
265
+ app = FastAPI(
266
+ title="Qwen2.5-VL API",
267
+ description="OpenAI-compatible API for Qwen2.5-VL vision-language model",
268
+ version="1.0.0",
269
+ lifespan=lifespan
270
+ )
271
+
272
+ app.add_middleware(
273
+ CORSMiddleware,
274
+ allow_origins=["*"],
275
+ allow_credentials=True,
276
+ allow_methods=["*"],
277
+ allow_headers=["*"],
278
+ )
279
+
280
+ @app.get("/v1/models", response_model=ModelList)
281
+ async def list_models():
282
+ """List available models"""
283
+ model_cards = []
284
+ for model_name in MODELS.keys():
285
+ model_cards.append(
286
+ ModelCard(
287
+ id=model_name,
288
+ created=1709251200,
289
+ owned_by="Qwen",
290
+ permission=[{
291
+ "id": f"modelperm-{model_name}",
292
+ "created": 1709251200,
293
+ "allow_create_engine": False,
294
+ "allow_sampling": True,
295
+ "allow_logprobs": True,
296
+ "allow_search_indices": False,
297
+ "allow_view": True,
298
+ "allow_fine_tuning": False,
299
+ "organization": "*",
300
+ "group": None,
301
+ "is_blocking": False
302
+ }],
303
+ capabilities={
304
+ "vision": True,
305
+ "chat": True,
306
+ "embeddings": False,
307
+ "text_completion": True
308
+ },
309
+ context_window=4096,
310
+ max_tokens=2048
311
+ )
312
+ )
313
+ return ModelList(data=model_cards)
314
+
315
+ @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
316
+ async def chat_completions(request: ChatCompletionRequest):
317
+ """Handle chat completion requests with vision support"""
318
+ try:
319
+ # Get or initialize requested model
320
+ model, processor = get_or_initialize_model(request.model)
321
+
322
+ request_start_time = time.time()
323
+ logger.info(f"Received chat completion request for model: {request.model}")
324
+ logger.info(f"Request content: {request.model_dump_json()}")
325
+
326
+ messages = []
327
+ for msg in request.messages:
328
+ if isinstance(msg.content, str):
329
+ messages.append({"role": msg.role, "content": msg.content})
330
+ else:
331
+ processed_content = []
332
+ for content_item in msg.content:
333
+ if content_item.type == "text":
334
+ processed_content.append({
335
+ "type": "text",
336
+ "text": content_item.text
337
+ })
338
+ elif content_item.type == "image_url":
339
+ if "url" in content_item.image_url:
340
+ if content_item.image_url["url"].startswith("data:image"):
341
+ processed_content.append({
342
+ "type": "image",
343
+ "image": process_base64_image(content_item.image_url["url"])
344
+ })
345
+ messages.append({"role": msg.role, "content": processed_content})
346
+
347
+ text = processor.apply_chat_template(
348
+ messages,
349
+ tokenize=False,
350
+ add_generation_prompt=True
351
+ )
352
+
353
+ image_inputs, video_inputs = process_vision_info(messages)
354
+
355
+ # Ensure input data is on the correct device
356
+ inputs = processor(
357
+ text=[text],
358
+ images=image_inputs,
359
+ videos=video_inputs,
360
+ padding=True,
361
+ return_tensors="pt"
362
+ )
363
+
364
+ # Move all tensors to specified device
365
+ input_tensors = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}
366
+
367
+ with torch.inference_mode():
368
+ generated_ids = model.generate(
369
+ **input_tensors,
370
+ max_new_tokens=request.max_tokens,
371
+ temperature=request.temperature,
372
+ top_p=request.top_p,
373
+ pad_token_id=processor.tokenizer.pad_token_id,
374
+ eos_token_id=processor.tokenizer.eos_token_id
375
+ )
376
+
377
+ # Get input length and trim generated IDs
378
+ input_length = input_tensors['input_ids'].shape[1]
379
+ generated_ids_trimmed = generated_ids[:, input_length:]
380
+
381
+ response = processor.batch_decode(
382
+ generated_ids_trimmed,
383
+ skip_special_tokens=True,
384
+ clean_up_tokenization_spaces=False
385
+ )[0]
386
+
387
+ if request.response_format and request.response_format.get("type") == "json_object":
388
+ try:
389
+ if response.startswith('```'):
390
+ response = '\n'.join(response.split('\n')[1:-1])
391
+ if response.startswith('json'):
392
+ response = response[4:].lstrip()
393
+ content = json.loads(response)
394
+ response = json.dumps(content)
395
+ except json.JSONDecodeError as e:
396
+ logger.error(f"JSON parsing error: {str(e)}")
397
+ raise HTTPException(status_code=400, detail=f"Invalid JSON response: {str(e)}")
398
+
399
+ total_time = time.time() - request_start_time
400
+ logger.info(f"Request completed in {total_time:.2f} seconds")
401
+
402
+ return ChatCompletionResponse(
403
+ id=f"chatcmpl-{datetime.now().strftime('%Y%m%d%H%M%S')}",
404
+ object="chat.completion",
405
+ created=int(datetime.now().timestamp()),
406
+ model=request.model,
407
+ choices=[{
408
+ "index": 0,
409
+ "message": {
410
+ "role": "assistant",
411
+ "content": response
412
+ },
413
+ "finish_reason": "stop"
414
+ }],
415
+ usage={
416
+ "prompt_tokens": input_length,
417
+ "completion_tokens": len(generated_ids_trimmed[0]),
418
+ "total_tokens": input_length + len(generated_ids_trimmed[0])
419
+ }
420
+ )
421
+ except Exception as e:
422
+ logger.error(f"Request error: {str(e)}", exc_info=True)
423
+ if isinstance(e, HTTPException):
424
+ raise
425
+ raise HTTPException(status_code=500, detail=str(e))
426
+
427
+ @app.get("/health")
428
+ async def health_check():
429
+ """Health check endpoint"""
430
+ log_system_info()
431
+ return {
432
+ "status": "healthy",
433
+ "loaded_models": list(models.keys()),
434
+ "device": str(device),
435
+ "cuda_available": torch.cuda.is_available(),
436
+ "cuda_device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
437
+ "timestamp": datetime.now().isoformat()
438
+ }
439
+
440
+ @app.get("/model_status")
441
+ async def model_status():
442
+ """Get the status of all models"""
443
+ status = {}
444
+ for model_name in MODELS:
445
+ status[model_name] = {
446
+ "loaded": model_name in models,
447
+ "last_used": last_used.get(model_name, None),
448
+ "available": model_name in MODELS
449
+ }
450
+ return status
451
+
452
+ if __name__ == "__main__":
453
+ uvicorn.run(app, host="0.0.0.0", port=9192)
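
For reference, a minimal client sketch for the endpoint above; the host, image path, and prompt are assumptions, while the port (9192), route, and payload shape come from this file:

import base64
import requests  # hypothetical client, not part of this upload

with open("screenshot.png", "rb") as f:  # assumed local image
    b64 = base64.b64encode(f.read()).decode()

payload = {
    "model": "Qwen2-VL-2B-Instruct",
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in the screenshot?"},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
        ],
    }],
    "max_tokens": 256,
}
resp = requests.post("http://localhost:9192/v1/chat/completions", json=payload, timeout=300)
print(resp.json()["choices"][0]["message"]["content"])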
computer_use_demo/tools/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ from .base import CLIResult, ToolResult
2
+ from .bash import BashTool
3
+ from .collection import ToolCollection
4
+ from .computer import ComputerTool
5
+ from .edit import EditTool
6
+ from .screen_capture import get_screenshot
7
+
8
+ __ALL__ = [
9
+ BashTool,
10
+ CLIResult,
11
+ ComputerTool,
12
+ EditTool,
13
+ ToolCollection,
14
+ ToolResult,
15
+ get_screenshot,
16
+ ]
computer_use_demo/tools/base.py ADDED
@@ -0,0 +1,69 @@
1
+ from abc import ABCMeta, abstractmethod
2
+ from dataclasses import dataclass, fields, replace
3
+ from typing import Any
4
+
5
+ from anthropic.types.beta import BetaToolUnionParam
6
+
7
+
8
+ class BaseAnthropicTool(metaclass=ABCMeta):
9
+ """Abstract base class for Anthropic-defined tools."""
10
+
11
+ @abstractmethod
12
+ def __call__(self, **kwargs) -> Any:
13
+ """Executes the tool with the given arguments."""
14
+ ...
15
+
16
+ @abstractmethod
17
+ def to_params(
18
+ self,
19
+ ) -> BetaToolUnionParam:
20
+ raise NotImplementedError
21
+
22
+
23
+ @dataclass(kw_only=True, frozen=True)
24
+ class ToolResult:
25
+ """Represents the result of a tool execution."""
26
+
27
+ output: str | None = None
28
+ error: str | None = None
29
+ base64_image: str | None = None
30
+ system: str | None = None
31
+
32
+ def __bool__(self):
33
+ return any(getattr(self, field.name) for field in fields(self))
34
+
35
+ def __add__(self, other: "ToolResult"):
36
+ def combine_fields(
37
+ field: str | None, other_field: str | None, concatenate: bool = True
38
+ ):
39
+ if field and other_field:
40
+ if concatenate:
41
+ return field + other_field
42
+ raise ValueError("Cannot combine tool results")
43
+ return field or other_field
44
+
45
+ return ToolResult(
46
+ output=combine_fields(self.output, other.output),
47
+ error=combine_fields(self.error, other.error),
48
+ base64_image=combine_fields(self.base64_image, other.base64_image, False),
49
+ system=combine_fields(self.system, other.system),
50
+ )
51
+
52
+ def replace(self, **kwargs):
53
+ """Returns a new ToolResult with the given fields replaced."""
54
+ return replace(self, **kwargs)
55
+
56
+
57
+ class CLIResult(ToolResult):
58
+ """A ToolResult that can be rendered as a CLI output."""
59
+
60
+
61
+ class ToolFailure(ToolResult):
62
+ """A ToolResult that represents a failure."""
63
+
64
+
65
+ class ToolError(Exception):
66
+ """Raised when a tool encounters an error."""
67
+
68
+ def __init__(self, message):
69
+ self.message = message
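A small, purely illustrative sketch of the `ToolResult` semantics defined above (assumes the project's dependencies are installed so the module imports cleanly):

```python
# Illustrative: ToolResult.__add__ concatenates text fields, and replace()
# returns a new frozen instance with selected fields swapped out.
from computer_use_demo.tools.base import ToolResult

a = ToolResult(output="step 1 done\n")
b = ToolResult(output="step 2 done\n", system="tool restarted")

merged = a + b
print(merged.output)       # step 1 done / step 2 done
print(merged.system)       # tool restarted
print(bool(ToolResult()))  # False: an empty result is falsy

with_image = merged.replace(base64_image="aGVsbG8=")
print(with_image.base64_image)
```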
computer_use_demo/tools/bash.py ADDED
@@ -0,0 +1,136 @@
1
+ import asyncio
2
+ import os
3
+ from typing import ClassVar, Literal
4
+
5
+ from anthropic.types.beta import BetaToolBash20241022Param
6
+
7
+ from .base import BaseAnthropicTool, CLIResult, ToolError, ToolResult
8
+
9
+
10
+ class _BashSession:
11
+ """A session of a bash shell."""
12
+
13
+ _started: bool
14
+ _process: asyncio.subprocess.Process
15
+
16
+ command: str = "/bin/bash"
17
+ _output_delay: float = 0.2 # seconds
18
+ _timeout: float = 120.0 # seconds
19
+ _sentinel: str = "<<exit>>"
20
+
21
+ def __init__(self):
22
+ self._started = False
23
+ self._timed_out = False
24
+
25
+ async def start(self):
26
+ if self._started:
27
+ return
28
+
29
+ self._process = await asyncio.create_subprocess_shell(
30
+ self.command,
31
+ # NOTE: create_subprocess_shell always runs the command through a shell; passing shell=False would raise ValueError
32
+ stdin=asyncio.subprocess.PIPE,
33
+ stdout=asyncio.subprocess.PIPE,
34
+ stderr=asyncio.subprocess.PIPE,
35
+ )
36
+
37
+ self._started = True
38
+
39
+ def stop(self):
40
+ """Terminate the bash shell."""
41
+ if not self._started:
42
+ raise ToolError("Session has not started.")
43
+ if self._process.returncode is not None:
44
+ return
45
+ self._process.terminate()
46
+
47
+ async def run(self, command: str):
48
+ """Execute a command in the bash shell."""
49
+ if not self._started:
50
+ raise ToolError("Session has not started.")
51
+ if self._process.returncode is not None:
52
+ return ToolResult(
53
+ system="tool must be restarted",
54
+ error=f"bash has exited with returncode {self._process.returncode}",
55
+ )
56
+ if self._timed_out:
57
+ raise ToolError(
58
+ f"timed out: bash has not returned in {self._timeout} seconds and must be restarted",
59
+ )
60
+
61
+ # we know these are not None because we created the process with PIPEs
62
+ assert self._process.stdin
63
+ assert self._process.stdout
64
+ assert self._process.stderr
65
+
66
+ # send command to the process
67
+ self._process.stdin.write(
68
+ command.encode() + f"; echo '{self._sentinel}'\n".encode()
69
+ )
70
+ await self._process.stdin.drain()
71
+
72
+ # read output from the process, until the sentinel is found
73
+ output = ""
74
+ try:
75
+ async with asyncio.timeout(self._timeout):
76
+ while True:
77
+ await asyncio.sleep(self._output_delay)
78
+ data = await self._process.stdout.readline()
79
+ if not data:
80
+ break
81
+ line = data.decode()
82
+ output += line
83
+ if self._sentinel in line:
84
+ output = output.replace(self._sentinel, "")
85
+ break
86
+ except asyncio.TimeoutError:
87
+ self._timed_out = True
88
+ raise ToolError(
89
+ f"timed out: bash has not returned in {self._timeout} seconds and must be restarted",
90
+ ) from None
91
+
92
+ # drain whatever is buffered on stderr; reading until EOF would block while the shell stays open
93
+ error = bytes(self._process.stderr._buffer).decode()
+ self._process.stderr._buffer.clear()
94
+
95
+ return CLIResult(output=output.strip(), error=error.strip())
96
+
97
+
98
+ class BashTool(BaseAnthropicTool):
99
+ """
100
+ A tool that allows the agent to run bash commands.
101
+ The tool parameters are defined by Anthropic and are not editable.
102
+ """
103
+
104
+ _session: _BashSession | None
105
+ name: ClassVar[Literal["bash"]] = "bash"
106
+ api_type: ClassVar[Literal["bash_20241022"]] = "bash_20241022"
107
+
108
+ def __init__(self):
109
+ self._session = None
110
+ super().__init__()
111
+
112
+ async def __call__(
113
+ self, command: str | None = None, restart: bool = False, **kwargs
114
+ ):
115
+ if restart:
116
+ if self._session:
117
+ self._session.stop()
118
+ self._session = _BashSession()
119
+ await self._session.start()
120
+
121
+ return ToolResult(system="tool has been restarted.")
122
+
123
+ if self._session is None:
124
+ self._session = _BashSession()
125
+ await self._session.start()
126
+
127
+ if command is not None:
128
+ return await self._session.run(command)
129
+
130
+ raise ToolError("no command provided.")
131
+
132
+ def to_params(self) -> BetaToolBash20241022Param:
133
+ return {
134
+ "type": self.api_type,
135
+ "name": self.name,
136
+ }
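A minimal sketch of driving `BashTool` from an asyncio entry point (assumes a POSIX system with `/bin/bash`; the command is illustrative):

```python
# Sketch: run one command, then restart the underlying bash session.
import asyncio

from computer_use_demo.tools.bash import BashTool


async def main() -> None:
    bash = BashTool()
    result = await bash(command="echo hello from the agent")
    print(result.output)       # -> hello from the agent
    print(result.error or "")  # stderr, if any

    restarted = await bash(restart=True)
    print(restarted.system)    # -> tool has been restarted.


asyncio.run(main())
```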
computer_use_demo/tools/collection.py ADDED
@@ -0,0 +1,41 @@
1
+ """Collection classes for managing multiple tools."""
2
+
3
+ from typing import Any
4
+
5
+ from anthropic.types.beta import BetaToolUnionParam
6
+
7
+ from .base import (
8
+ BaseAnthropicTool,
9
+ ToolError,
10
+ ToolFailure,
11
+ ToolResult,
12
+ )
13
+
14
+
15
+ class ToolCollection:
16
+ """A collection of anthropic-defined tools."""
17
+
18
+ def __init__(self, *tools: BaseAnthropicTool):
19
+ self.tools = tools
20
+ self.tool_map = {tool.to_params()["name"]: tool for tool in tools}
21
+
22
+ def to_params(
23
+ self,
24
+ ) -> list[BetaToolUnionParam]:
25
+ return [tool.to_params() for tool in self.tools]
26
+
27
+ async def run(self, *, name: str, tool_input: dict[str, Any]) -> ToolResult:
28
+ tool = self.tool_map.get(name)
29
+ if not tool:
30
+ return ToolFailure(error=f"Tool {name} is invalid")
31
+ try:
32
+ return await tool(**tool_input)
33
+ except ToolError as e:
34
+ return ToolFailure(error=e.message)
35
+
36
+ def sync_call(self, *, name: str, tool_input: dict[str, Any]) -> ToolResult:
37
+ print(f"sync_call: {name} {tool_input}")
38
+ tool = self.tool_map.get(name)
39
+ if not tool:
40
+ return ToolFailure(error=f"Tool {name} is invalid")
41
+ return tool.sync_call(**tool_input)
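A short sketch of dispatching through `ToolCollection` by tool name (requires a machine with a display, since `ComputerTool` inspects the monitors):

```python
# Sketch: route a screenshot request to ComputerTool via the collection.
import asyncio

from computer_use_demo.tools import ComputerTool, ToolCollection


async def main() -> None:
    collection = ToolCollection(ComputerTool())
    result = await collection.run(name="computer", tool_input={"action": "screenshot"})
    if result.error:
        print("tool failed:", result.error)
    else:
        print("screenshot captured, base64 length:", len(result.base64_image or ""))


asyncio.run(main())
```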
computer_use_demo/tools/colorful_text.py ADDED
1
+ """
2
+ Define some colorful stuffs for better visualization in the chat.
3
+ """
4
+
5
+ # Define the RGB colors for each letter
6
+ colors = {
7
+ 'S': 'rgb(106, 158, 210)',
8
+ 'h': 'rgb(111, 163, 82)',
9
+ 'o': 'rgb(209, 100, 94)',
10
+ 'w': 'rgb(238, 171, 106)',
11
+ 'U': 'rgb(0, 0, 0)',
12
+ 'I': 'rgb(0, 0, 0)',
13
+ }
14
+
15
+ # Construct the colorful "ShowUI" word
16
+ colorful_text_showui = "**"+''.join(
17
+ f'<span style="color:{colors.get(letter, "black")}">{letter}</span>'
18
+ for letter in "ShowUI"
19
+ )+"**"
20
+
21
+
22
+ colorful_text_vlm = "**VLMPlanner**"
23
+
24
+ colorful_text_user = "**User**"
25
+
26
+ # print(f"colorful_text_showui: {colorful_text_showui}")
27
+ # **<span style="color:rgb(106, 158, 210)">S</span><span style="color:rgb(111, 163, 82)">h</span><span style="color:rgb(209, 100, 94)">o</span><span style="color:rgb(238, 171, 106)">w</span><span style="color:rgb(0, 0, 0)">U</span><span style="color:rgb(0, 0, 0)">I</span>**
computer_use_demo/tools/computer.py ADDED
@@ -0,0 +1,621 @@
1
+ import subprocess
2
+ import platform
3
+ import pyautogui
4
+ import asyncio
5
+ import base64
6
+ import os
7
+ import time
8
+ if platform.system() == "Darwin":
9
+ import Quartz  # macOS-only dependency; imported only when running on macOS
10
+ from enum import StrEnum
11
+ from pathlib import Path
12
+ from typing import Literal, TypedDict
13
+ from uuid import uuid4
14
+ from screeninfo import get_monitors
15
+
16
+ from PIL import ImageGrab, Image
17
+ from functools import partial
18
+
19
+ from anthropic.types.beta import BetaToolComputerUse20241022Param
20
+
21
+ from .base import BaseAnthropicTool, ToolError, ToolResult
22
+ from .run import run
23
+
24
+ OUTPUT_DIR = "./tmp/outputs"
25
+
26
+ TYPING_DELAY_MS = 12
27
+ TYPING_GROUP_SIZE = 50
28
+
29
+ Action = Literal[
30
+ "key",
31
+ "type",
32
+ "mouse_move",
33
+ "left_click",
34
+ "left_click_drag",
35
+ "right_click",
36
+ "middle_click",
37
+ "double_click",
38
+ "screenshot",
39
+ "cursor_position",
40
+ ]
41
+
42
+
43
+ class Resolution(TypedDict):
44
+ width: int
45
+ height: int
46
+
47
+
48
+ MAX_SCALING_TARGETS: dict[str, Resolution] = {
49
+ "XGA": Resolution(width=1024, height=768), # 4:3
50
+ "WXGA": Resolution(width=1280, height=800), # 16:10
51
+ "FWXGA": Resolution(width=1366, height=768), # ~16:9
52
+ }
53
+
54
+
55
+ class ScalingSource(StrEnum):
56
+ COMPUTER = "computer"
57
+ API = "api"
58
+
59
+
60
+ class ComputerToolOptions(TypedDict):
61
+ display_height_px: int
62
+ display_width_px: int
63
+ display_number: int | None
64
+
65
+
66
+ def chunks(s: str, chunk_size: int) -> list[str]:
67
+ return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)]
68
+
69
+
70
+ def get_screen_details():
71
+ screens = get_monitors()
72
+ screen_details = []
73
+
74
+ # Sort screens by x position to arrange from left to right
75
+ sorted_screens = sorted(screens, key=lambda s: s.x)
76
+
77
+ # Loop through sorted screens and assign positions
78
+ primary_index = 0
79
+ for i, screen in enumerate(sorted_screens):
80
+ if i == 0:
81
+ layout = "Left"
82
+ elif i == len(sorted_screens) - 1:
83
+ layout = "Right"
84
+ else:
85
+ layout = "Center"
86
+
87
+ if screen.is_primary:
88
+ position = "Primary"
89
+ primary_index = i
90
+ else:
91
+ position = "Secondary"
92
+ screen_info = f"Screen {i + 1}: {screen.width}x{screen.height}, {layout}, {position}"
93
+ screen_details.append(screen_info)
94
+
95
+ return screen_details, primary_index
96
+
97
+
98
+ class ComputerTool(BaseAnthropicTool):
99
+ """
100
+ A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.
101
+ Adapted for Windows and macOS using 'pyautogui'.
102
+ """
103
+
104
+ name: Literal["computer"] = "computer"
105
+ api_type: Literal["computer_20241022"] = "computer_20241022"
106
+ width: int
107
+ height: int
108
+ display_num: int | None
109
+
110
+ _screenshot_delay = 2.0
111
+ _scaling_enabled = True
112
+
113
+ @property
114
+ def options(self) -> ComputerToolOptions:
115
+ width, height = self.scale_coordinates(
116
+ ScalingSource.COMPUTER, self.width, self.height
117
+ )
118
+ return {
119
+ "display_width_px": width,
120
+ "display_height_px": height,
121
+ "display_number": self.display_num,
122
+ }
123
+
124
+ def to_params(self) -> BetaToolComputerUse20241022Param:
125
+ return {"name": self.name, "type": self.api_type, **self.options}
126
+
127
+ def __init__(self, selected_screen: int = 0, is_scaling: bool = True):
128
+ super().__init__()
129
+
130
+ # Get screen width and height using Windows command
131
+ self.display_num = None
132
+ self.offset_x = 0
133
+ self.offset_y = 0
134
+ self.selected_screen = selected_screen
135
+ self.is_scaling = is_scaling
136
+ self.width, self.height = self.get_screen_size()
137
+
138
+ # Path to cliclick
139
+ self.cliclick = "cliclick"
140
+ self.key_conversion = {"Page_Down": "pagedown",
141
+ "Page_Up": "pageup",
142
+ "Super_L": "win",
143
+ "Escape": "esc"}
144
+
145
+ self.action_conversion = {"left click": "click",
146
+ "right click": "right_click"}
147
+
148
+ system = platform.system() # Detect platform
149
+ if system == "Windows":
150
+ screens = get_monitors()
151
+ sorted_screens = sorted(screens, key=lambda s: s.x)
152
+ if self.selected_screen < 0 or self.selected_screen >= len(screens):
153
+ raise IndexError("Invalid screen index.")
154
+ screen = sorted_screens[self.selected_screen]
155
+ bbox = (screen.x, screen.y, screen.x + screen.width, screen.y + screen.height)
156
+
157
+ elif system == "Darwin": # macOS
158
+ max_displays = 32 # Maximum number of displays to handle
159
+ active_displays = Quartz.CGGetActiveDisplayList(max_displays, None, None)[1]
160
+ screens = []
161
+ for display_id in active_displays:
162
+ bounds = Quartz.CGDisplayBounds(display_id)
163
+ screens.append({
164
+ 'id': display_id, 'x': int(bounds.origin.x), 'y': int(bounds.origin.y),
165
+ 'width': int(bounds.size.width), 'height': int(bounds.size.height),
166
+ 'is_primary': Quartz.CGDisplayIsMain(display_id) # Check if this is the primary display
167
+ })
168
+ sorted_screens = sorted(screens, key=lambda s: s['x'])
169
+ if self.selected_screen < 0 or self.selected_screen >= len(screens):
170
+ raise IndexError("Invalid screen index.")
171
+ screen = sorted_screens[self.selected_screen]
172
+ bbox = (screen['x'], screen['y'], screen['x'] + screen['width'], screen['y'] + screen['height'])
173
+ else: # Linux or other OS
174
+ cmd = "xrandr | grep ' primary' | awk '{print $4}'"
175
+ try:
176
+ output = subprocess.check_output(cmd, shell=True).decode()
177
+ resolution = output.strip().split()[0]
178
+ width, height = map(int, resolution.split('x'))
179
+ bbox = (0, 0, width, height) # Assuming single primary screen for simplicity
180
+ except subprocess.CalledProcessError:
181
+ raise RuntimeError("Failed to get screen resolution on Linux.")
182
+
183
+ self.offset_x = screen['x'] if system == "Darwin" else (screen.x if system == "Windows" else 0)
184
+ self.offset_y = screen['y'] if system == "Darwin" else (screen.y if system == "Windows" else 0)
185
+ self.bbox = bbox
186
+
187
+
188
+ async def __call__(
189
+ self,
190
+ *,
191
+ action: Action,
192
+ text: str | None = None,
193
+ coordinate: tuple[int, int] | None = None,
194
+ **kwargs,
195
+ ):
196
+ print(f"action: {action}, text: {text}, coordinate: {coordinate}")
197
+ action = self.action_conversion.get(action, action)
198
+
199
+ if action in ("mouse_move", "left_click_drag"):
200
+ if coordinate is None:
201
+ raise ToolError(f"coordinate is required for {action}")
202
+ if text is not None:
203
+ raise ToolError(f"text is not accepted for {action}")
204
+ if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
205
+ raise ToolError(f"{coordinate} must be a tuple of length 2")
206
+ # if not all(isinstance(i, int) and i >= 0 for i in coordinate):
207
+ if not all(isinstance(i, int) for i in coordinate):
208
+ raise ToolError(f"{coordinate} must be a tuple of ints")
209
+
210
+ if self.is_scaling:
211
+ x, y = self.scale_coordinates(
212
+ ScalingSource.API, coordinate[0], coordinate[1]
213
+ )
214
+ else:
215
+ x, y = coordinate
216
+
217
+ # print(f"scaled_coordinates: {x}, {y}")
218
+ # print(f"offset: {self.offset_x}, {self.offset_y}")
219
+
220
+ x += self.offset_x
221
+ y += self.offset_y
222
+
223
+ print(f"mouse move to {x}, {y}")
224
+
225
+ if action == "mouse_move":
226
+ pyautogui.moveTo(x, y)
227
+ return ToolResult(output=f"Moved mouse to ({x}, {y})")
228
+ elif action == "left_click_drag":
229
+ current_x, current_y = pyautogui.position()
230
+ pyautogui.dragTo(x, y, duration=0.5) # Adjust duration as needed
231
+ return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")
232
+
233
+ if action in ("key", "type"):
234
+ if text is None:
235
+ raise ToolError(f"text is required for {action}")
236
+ if coordinate is not None:
237
+ raise ToolError(f"coordinate is not accepted for {action}")
238
+ if not isinstance(text, str):
239
+ raise ToolError(f"{text} must be a string")
240
+
241
+ if action == "key":
242
+ # Handle key combinations
243
+ keys = text.split('+')
244
+ for key in keys:
245
+ key = self.key_conversion.get(key.strip(), key.strip())
246
+ key = key.lower()
247
+ pyautogui.keyDown(key) # Press down each key
248
+ for key in reversed(keys):
249
+ key = self.key_conversion.get(key.strip(), key.strip())
250
+ key = key.lower()
251
+ pyautogui.keyUp(key) # Release each key in reverse order
252
+ return ToolResult(output=f"Pressed keys: {text}")
253
+
254
+ elif action == "type":
255
+ pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000) # Convert ms to seconds
256
+ screenshot_base64 = (await self.screenshot()).base64_image
257
+ return ToolResult(output=text, base64_image=screenshot_base64)
258
+
259
+ if action in (
260
+ "left_click",
261
+ "right_click",
262
+ "double_click",
263
+ "middle_click",
264
+ "screenshot",
265
+ "cursor_position",
266
+ "left_press",
267
+ ):
268
+ if text is not None:
269
+ raise ToolError(f"text is not accepted for {action}")
270
+ if coordinate is not None:
271
+ raise ToolError(f"coordinate is not accepted for {action}")
272
+
273
+ if action == "screenshot":
274
+ return await self.screenshot()
275
+ elif action == "cursor_position":
276
+ x, y = pyautogui.position()
277
+ x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
278
+ return ToolResult(output=f"X={x},Y={y}")
279
+ else:
280
+ if action == "left_click":
281
+ pyautogui.click()
282
+ elif action == "right_click":
283
+ pyautogui.rightClick()
284
+ elif action == "middle_click":
285
+ pyautogui.middleClick()
286
+ elif action == "double_click":
287
+ pyautogui.doubleClick()
288
+ elif action == "left_press":
289
+ pyautogui.mouseDown()
290
+ time.sleep(1)
291
+ pyautogui.mouseUp()
292
+ return ToolResult(output=f"Performed {action}")
293
+
294
+ raise ToolError(f"Invalid action: {action}")
295
+
296
+
297
+ def sync_call(
298
+ self,
299
+ *,
300
+ action: Action,
301
+ text: str | None = None,
302
+ coordinate: tuple[int, int] | None = None,
303
+ **kwargs,
304
+ ):
305
+ print(f"action: {action}, text: {text}, coordinate: {coordinate}")
306
+ action = self.action_conversion.get(action, action)
307
+
308
+ if action in ("mouse_move", "left_click_drag"):
309
+ if coordinate is None:
310
+ raise ToolError(f"coordinate is required for {action}")
311
+ if text is not None:
312
+ raise ToolError(f"text is not accepted for {action}")
313
+ if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
314
+ raise ToolError(f"{coordinate} must be a tuple of length 2")
315
+ # if not all(isinstance(i, int) and i >= 0 for i in coordinate):
316
+ if not all(isinstance(i, int) for i in coordinate):
317
+ raise ToolError(f"{coordinate} must be a tuple of ints")
318
+
319
+ if self.is_scaling:
320
+ x, y = self.scale_coordinates(
321
+ ScalingSource.API, coordinate[0], coordinate[1]
322
+ )
323
+ else:
324
+ x, y = coordinate
325
+
326
+ # print(f"scaled_coordinates: {x}, {y}")
327
+ # print(f"offset: {self.offset_x}, {self.offset_y}")
328
+ x += self.offset_x
329
+ y += self.offset_y
330
+
331
+ print(f"mouse move to {x}, {y}")
332
+
333
+ if action == "mouse_move":
334
+ pyautogui.moveTo(x, y)
335
+ return ToolResult(output=f"Moved mouse to ({x}, {y})")
336
+ elif action == "left_click_drag":
337
+ current_x, current_y = pyautogui.position()
338
+ pyautogui.dragTo(x, y, duration=0.5) # Adjust duration as needed
339
+ return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")
340
+
341
+ if action in ("key", "type"):
342
+ if text is None:
343
+ raise ToolError(f"text is required for {action}")
344
+ if coordinate is not None:
345
+ raise ToolError(f"coordinate is not accepted for {action}")
346
+ if not isinstance(text, str):
347
+ raise ToolError(f"{text} must be a string")
348
+
349
+ if action == "key":
350
+ # Handle key combinations
351
+ keys = text.split('+')
352
+ for key in keys:
353
+ key = self.key_conversion.get(key.strip(), key.strip())
354
+ key = key.lower()
355
+ pyautogui.keyDown(key) # Press down each key
356
+ for key in reversed(keys):
357
+ key = self.key_conversion.get(key.strip(), key.strip())
358
+ key = key.lower()
359
+ pyautogui.keyUp(key) # Release each key in reverse order
360
+ return ToolResult(output=f"Pressed keys: {text}")
361
+
362
+ elif action == "type":
363
+ pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000) # Convert ms to seconds
364
+ return ToolResult(output=text)
365
+
366
+ if action in (
367
+ "left_click",
368
+ "right_click",
369
+ "double_click",
370
+ "middle_click",
371
+ "screenshot",
372
+ "cursor_position",
373
+ "left_press",
374
+ ):
375
+ if text is not None:
376
+ raise ToolError(f"text is not accepted for {action}")
377
+ if coordinate is not None:
378
+ raise ToolError(f"coordinate is not accepted for {action}")
379
+ elif action == "cursor_position":
380
+ x, y = pyautogui.position()
381
+ x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
382
+ return ToolResult(output=f"X={x},Y={y}")
383
+ else:
384
+ if action == "left_click":
385
+ pyautogui.click()
386
+ elif action == "right_click":
387
+ pyautogui.rightClick()
388
+ elif action == "middle_click":
389
+ pyautogui.middleClick()
390
+ elif action == "double_click":
391
+ pyautogui.doubleClick()
392
+ elif action == "left_press":
393
+ pyautogui.mouseDown()
394
+ time.sleep(1)
395
+ pyautogui.mouseUp()
396
+ return ToolResult(output=f"Performed {action}")
397
+
398
+ raise ToolError(f"Invalid action: {action}")
399
+
400
+ async def screenshot(self):
401
+ """Take a screenshot of the current screen and return a ToolResult with the base64 encoded image."""
402
+
403
+ # brief pause so the UI can settle before capturing
404
+ time.sleep(1)
405
+
406
+ output_dir = Path(OUTPUT_DIR)
407
+ output_dir.mkdir(parents=True, exist_ok=True)
408
+ path = output_dir / f"screenshot_{uuid4().hex}.png"
409
+
410
+ ImageGrab.grab = partial(ImageGrab.grab, all_screens=True)
411
+
412
+ # Detect platform
413
+ system = platform.system()
414
+
415
+ if system == "Windows":
416
+ # Windows: Use screeninfo to get monitor details
417
+ screens = get_monitors()
418
+
419
+ # Sort screens by x position to arrange from left to right
420
+ sorted_screens = sorted(screens, key=lambda s: s.x)
421
+
422
+ if self.selected_screen < 0 or self.selected_screen >= len(screens):
423
+ raise IndexError("Invalid screen index.")
424
+
425
+ screen = sorted_screens[self.selected_screen]
426
+ bbox = (screen.x, screen.y, screen.x + screen.width, screen.y + screen.height)
427
+
428
+ elif system == "Darwin": # macOS
429
+ # macOS: Use Quartz to get monitor details
430
+ max_displays = 32 # Maximum number of displays to handle
431
+ active_displays = Quartz.CGGetActiveDisplayList(max_displays, None, None)[1]
432
+
433
+ # Get the display bounds (resolution) for each active display
434
+ screens = []
435
+ for display_id in active_displays:
436
+ bounds = Quartz.CGDisplayBounds(display_id)
437
+ screens.append({
438
+ 'id': display_id,
439
+ 'x': int(bounds.origin.x),
440
+ 'y': int(bounds.origin.y),
441
+ 'width': int(bounds.size.width),
442
+ 'height': int(bounds.size.height),
443
+ 'is_primary': Quartz.CGDisplayIsMain(display_id) # Check if this is the primary display
444
+ })
445
+
446
+ # Sort screens by x position to arrange from left to right
447
+ sorted_screens = sorted(screens, key=lambda s: s['x'])
448
+
449
+ if self.selected_screen < 0 or self.selected_screen >= len(screens):
450
+ raise IndexError("Invalid screen index.")
451
+
452
+ screen = sorted_screens[self.selected_screen]
453
+ bbox = (screen['x'], screen['y'], screen['x'] + screen['width'], screen['y'] + screen['height'])
454
+
455
+ else: # Linux or other OS
456
+ cmd = "xrandr | grep ' primary' | awk '{print $4}'"
457
+ try:
458
+ output = subprocess.check_output(cmd, shell=True).decode()
459
+ resolution = output.strip().split()[0]
460
+ width, height = map(int, resolution.split('x'))
461
+ bbox = (0, 0, width, height) # Assuming single primary screen for simplicity
462
+ except subprocess.CalledProcessError:
463
+ raise RuntimeError("Failed to get screen resolution on Linux.")
464
+
465
+ # Take screenshot using the bounding box
466
+ screenshot = ImageGrab.grab(bbox=bbox)
467
+
468
+ # Set offsets (for potential future use)
469
+ self.offset_x = screen['x'] if system == "Darwin" else (screen.x if system == "Windows" else 0)
470
+ self.offset_y = screen['y'] if system == "Darwin" else (screen.y if system == "Windows" else 0)
471
+
472
+
473
+
474
+ if not hasattr(self, 'target_dimension'):
475
+ screenshot = self.padding_image(screenshot)
476
+ self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
477
+
478
+ # Resize if target_dimensions are specified
479
+ print(f"offset is {self.offset_x}, {self.offset_y}")
480
+ print(f"target_dimension is {self.target_dimension}")
481
+ screenshot = screenshot.resize((self.target_dimension["width"], self.target_dimension["height"]))
482
+
483
+ # Save the screenshot
484
+ screenshot.save(str(path))
485
+
486
+ if path.exists():
487
+ # Return a ToolResult instance instead of a dictionary
488
+ return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())
489
+
490
+ raise ToolError(f"Failed to take screenshot: {path} does not exist.")
491
+
492
+ def padding_image(self, screenshot):
493
+ """Pad the screenshot to 16:10 aspect ratio, when the aspect ratio is not 16:10."""
494
+ _, height = screenshot.size
495
+ new_width = height * 16 // 10
496
+
497
+ padding_image = Image.new("RGB", (new_width, height), (255, 255, 255))
498
+ # padding to top left
499
+ padding_image.paste(screenshot, (0, 0))
500
+ return padding_image
501
+
502
+ async def shell(self, command: str, take_screenshot=True) -> ToolResult:
503
+ """Run a shell command and return the output, error, and optionally a screenshot."""
504
+ _, stdout, stderr = await run(command)
505
+ base64_image = None
506
+
507
+ if take_screenshot:
508
+ # delay to let things settle before taking a screenshot
509
+ await asyncio.sleep(self._screenshot_delay)
510
+ base64_image = (await self.screenshot()).base64_image
511
+
512
+ return ToolResult(output=stdout, error=stderr, base64_image=base64_image)
513
+
514
+ def scale_coordinates(self, source: ScalingSource, x: int, y: int):
515
+ """Scale coordinates to a target maximum resolution."""
516
+ if not self._scaling_enabled:
517
+ return x, y
518
+ ratio = self.width / self.height
519
+ target_dimension = None
520
+
521
+ for target_name, dimension in MAX_SCALING_TARGETS.items():
522
+ # allow some error in the aspect ratio - not all ratios are exactly 16:9
523
+ if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
524
+ if dimension["width"] < self.width:
525
+ target_dimension = dimension
526
+ self.target_dimension = target_dimension
527
+ # print(f"target_dimension: {target_dimension}")
528
+ break
529
+
530
+ if target_dimension is None:
531
+ # TODO: currently we force the target to be WXGA (16:10), when it cannot find a match
532
+ target_dimension = MAX_SCALING_TARGETS["WXGA"]
533
+ self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
534
+
535
+ # should be less than 1
536
+ x_scaling_factor = target_dimension["width"] / self.width
537
+ y_scaling_factor = target_dimension["height"] / self.height
538
+ if source == ScalingSource.API:
539
+ if x > self.width or y > self.height:
540
+ raise ToolError(f"Coordinates {x}, {y} are out of bounds")
541
+ # scale up
542
+ return round(x / x_scaling_factor), round(y / y_scaling_factor)
543
+ # scale down
544
+ return round(x * x_scaling_factor), round(y * y_scaling_factor)
545
+
546
+ def get_screen_size(self):
547
+ if platform.system() == "Windows":
548
+ # Use screeninfo to get primary monitor on Windows
549
+ screens = get_monitors()
550
+
551
+ # Sort screens by x position to arrange from left to right
552
+ sorted_screens = sorted(screens, key=lambda s: s.x)
553
+
554
+ if self.selected_screen is None:
555
+ primary_monitor = next((m for m in get_monitors() if m.is_primary), None)
556
+ return primary_monitor.width, primary_monitor.height
557
+ elif self.selected_screen < 0 or self.selected_screen >= len(screens):
558
+ raise IndexError("Invalid screen index.")
559
+ else:
560
+ screen = sorted_screens[self.selected_screen]
561
+ return screen.width, screen.height
562
+
563
+ elif platform.system() == "Darwin":
564
+ # macOS part using Quartz to get screen information
565
+ max_displays = 32 # Maximum number of displays to handle
566
+ active_displays = Quartz.CGGetActiveDisplayList(max_displays, None, None)[1]
567
+
568
+ # Get the display bounds (resolution) for each active display
569
+ screens = []
570
+ for display_id in active_displays:
571
+ bounds = Quartz.CGDisplayBounds(display_id)
572
+ screens.append({
573
+ 'id': display_id,
574
+ 'x': int(bounds.origin.x),
575
+ 'y': int(bounds.origin.y),
576
+ 'width': int(bounds.size.width),
577
+ 'height': int(bounds.size.height),
578
+ 'is_primary': Quartz.CGDisplayIsMain(display_id) # Check if this is the primary display
579
+ })
580
+
581
+ # Sort screens by x position to arrange from left to right
582
+ sorted_screens = sorted(screens, key=lambda s: s['x'])
583
+
584
+ if self.selected_screen is None:
585
+ # Find the primary monitor
586
+ primary_monitor = next((screen for screen in screens if screen['is_primary']), None)
587
+ if primary_monitor:
588
+ return primary_monitor['width'], primary_monitor['height']
589
+ else:
590
+ raise RuntimeError("No primary monitor found.")
591
+ elif self.selected_screen < 0 or self.selected_screen >= len(screens):
592
+ raise IndexError("Invalid screen index.")
593
+ else:
594
+ # Return the resolution of the selected screen
595
+ screen = sorted_screens[self.selected_screen]
596
+ return screen['width'], screen['height']
597
+
598
+ else: # Linux or other OS
599
+ cmd = "xrandr | grep ' primary' | awk '{print $4}'"
600
+ try:
601
+ output = subprocess.check_output(cmd, shell=True).decode()
602
+ resolution = output.strip().split()[0]
603
+ width, height = map(int, resolution.split('x'))
604
+ return width, height
605
+ except subprocess.CalledProcessError:
606
+ raise RuntimeError("Failed to get screen resolution on Linux.")
607
+
608
+ def get_mouse_position(self):
609
+ # TODO: enhance this func
610
+ from AppKit import NSEvent
611
+ from Quartz import CGEventSourceCreate, kCGEventSourceStateCombinedSessionState
612
+
613
+ loc = NSEvent.mouseLocation()
614
+ # Adjust for different coordinate system
615
+ return int(loc.x), int(self.height - loc.y)
616
+
617
+ def map_keys(self, text: str):
618
+ """Map text to cliclick key codes if necessary."""
619
+ # For simplicity, return text as is
620
+ # Implement mapping if special keys are needed
621
+ return text
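A minimal sketch of calling `ComputerTool` directly. Coordinates are interpreted in the scaled target space handled by `scale_coordinates`, and the values below are placeholders; running this will actually move the mouse and type on the selected screen.

```python
# Sketch: move the mouse, click, and type on screen 0. Placeholder coordinates.
import asyncio

from computer_use_demo.tools.computer import ComputerTool


async def main() -> None:
    computer = ComputerTool(selected_screen=0)
    print((await computer(action="mouse_move", coordinate=(100, 200))).output)
    print((await computer(action="left_click")).output)
    print((await computer(action="type", text="hello")).output)


asyncio.run(main())
```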
computer_use_demo/tools/edit.py ADDED
@@ -0,0 +1,290 @@
1
+ from collections import defaultdict
2
+ from pathlib import Path
3
+ from typing import Literal, get_args
4
+
5
+ from anthropic.types.beta import BetaToolTextEditor20241022Param
6
+
7
+ from .base import BaseAnthropicTool, CLIResult, ToolError, ToolResult
8
+ from .run import maybe_truncate, run
9
+
10
+ Command = Literal[
11
+ "view",
12
+ "create",
13
+ "str_replace",
14
+ "insert",
15
+ "undo_edit",
16
+ ]
17
+ SNIPPET_LINES: int = 4
18
+
19
+
20
+ class EditTool(BaseAnthropicTool):
21
+ """
22
+ A filesystem editor tool that allows the agent to view, create, and edit files.
23
+ The tool parameters are defined by Anthropic and are not editable.
24
+ """
25
+
26
+ api_type: Literal["text_editor_20241022"] = "text_editor_20241022"
27
+ name: Literal["str_replace_editor"] = "str_replace_editor"
28
+
29
+ _file_history: dict[Path, list[str]]
30
+
31
+ def __init__(self):
32
+ self._file_history = defaultdict(list)
33
+ super().__init__()
34
+
35
+ def to_params(self) -> BetaToolTextEditor20241022Param:
36
+ return {
37
+ "name": self.name,
38
+ "type": self.api_type,
39
+ }
40
+
41
+ async def __call__(
42
+ self,
43
+ *,
44
+ command: Command,
45
+ path: str,
46
+ file_text: str | None = None,
47
+ view_range: list[int] | None = None,
48
+ old_str: str | None = None,
49
+ new_str: str | None = None,
50
+ insert_line: int | None = None,
51
+ **kwargs,
52
+ ):
53
+ _path = Path(path)
54
+ self.validate_path(command, _path)
55
+ if command == "view":
56
+ return await self.view(_path, view_range)
57
+ elif command == "create":
58
+ if not file_text:
59
+ raise ToolError("Parameter `file_text` is required for command: create")
60
+ self.write_file(_path, file_text)
61
+ self._file_history[_path].append(file_text)
62
+ return ToolResult(output=f"File created successfully at: {_path}")
63
+ elif command == "str_replace":
64
+ if not old_str:
65
+ raise ToolError(
66
+ "Parameter `old_str` is required for command: str_replace"
67
+ )
68
+ return self.str_replace(_path, old_str, new_str)
69
+ elif command == "insert":
70
+ if insert_line is None:
71
+ raise ToolError(
72
+ "Parameter `insert_line` is required for command: insert"
73
+ )
74
+ if not new_str:
75
+ raise ToolError("Parameter `new_str` is required for command: insert")
76
+ return self.insert(_path, insert_line, new_str)
77
+ elif command == "undo_edit":
78
+ return self.undo_edit(_path)
79
+ raise ToolError(
80
+ f'Unrecognized command {command}. The allowed commands for the {self.name} tool are: {", ".join(get_args(Command))}'
81
+ )
82
+
83
+ def validate_path(self, command: str, path: Path):
84
+ """
85
+ Check that the path/command combination is valid.
86
+ """
87
+ # Check if its an absolute path
88
+ if not path.is_absolute():
89
+ suggested_path = Path("") / path
90
+ raise ToolError(
91
+ f"The path {path} is not an absolute path, it should start with `/`. Maybe you meant {suggested_path}?"
92
+ )
93
+ # Check if path exists
94
+ if not path.exists() and command != "create":
95
+ raise ToolError(
96
+ f"The path {path} does not exist. Please provide a valid path."
97
+ )
98
+ if path.exists() and command == "create":
99
+ raise ToolError(
100
+ f"File already exists at: {path}. Cannot overwrite files using command `create`."
101
+ )
102
+ # Check if the path points to a directory
103
+ if path.is_dir():
104
+ if command != "view":
105
+ raise ToolError(
106
+ f"The path {path} is a directory and only the `view` command can be used on directories"
107
+ )
108
+
109
+ async def view(self, path: Path, view_range: list[int] | None = None):
110
+ """Implement the view command"""
111
+ if path.is_dir():
112
+ if view_range:
113
+ raise ToolError(
114
+ "The `view_range` parameter is not allowed when `path` points to a directory."
115
+ )
116
+
117
+ _, stdout, stderr = await run(
118
+ rf"find {path} -maxdepth 2 -not -path '*/\.*'"
119
+ )
120
+ if not stderr:
121
+ stdout = f"Here's the files and directories up to 2 levels deep in {path}, excluding hidden items:\n{stdout}\n"
122
+ return CLIResult(output=stdout, error=stderr)
123
+
124
+ file_content = self.read_file(path)
125
+ init_line = 1
126
+ if view_range:
127
+ if len(view_range) != 2 or not all(isinstance(i, int) for i in view_range):
128
+ raise ToolError(
129
+ "Invalid `view_range`. It should be a list of two integers."
130
+ )
131
+ file_lines = file_content.split("\n")
132
+ n_lines_file = len(file_lines)
133
+ init_line, final_line = view_range
134
+ if init_line < 1 or init_line > n_lines_file:
135
+ raise ToolError(
136
+ f"Invalid `view_range`: {view_range}. Its first element `{init_line}` should be within the range of lines of the file: {[1, n_lines_file]}"
137
+ )
138
+ if final_line > n_lines_file:
139
+ raise ToolError(
140
+ f"Invalid `view_range`: {view_range}. Its second element `{final_line}` should not exceed the number of lines in the file: `{n_lines_file}`"
141
+ )
142
+ if final_line != -1 and final_line < init_line:
143
+ raise ToolError(
144
+ f"Invalid `view_range`: {view_range}. Its second element `{final_line}` should be greater than or equal to its first `{init_line}`"
145
+ )
146
+
147
+ if final_line == -1:
148
+ file_content = "\n".join(file_lines[init_line - 1 :])
149
+ else:
150
+ file_content = "\n".join(file_lines[init_line - 1 : final_line])
151
+
152
+ return CLIResult(
153
+ output=self._make_output(file_content, str(path), init_line=init_line)
154
+ )
155
+
156
+ def str_replace(self, path: Path, old_str: str, new_str: str | None):
157
+ """Implement the str_replace command, which replaces old_str with new_str in the file content"""
158
+ # Read the file content
159
+ file_content = self.read_file(path).expandtabs()
160
+ old_str = old_str.expandtabs()
161
+ new_str = new_str.expandtabs() if new_str is not None else ""
162
+
163
+ # Check if old_str is unique in the file
164
+ occurrences = file_content.count(old_str)
165
+ if occurrences == 0:
166
+ raise ToolError(
167
+ f"No replacement was performed, old_str `{old_str}` did not appear verbatim in {path}."
168
+ )
169
+ elif occurrences > 1:
170
+ file_content_lines = file_content.split("\n")
171
+ lines = [
172
+ idx + 1
173
+ for idx, line in enumerate(file_content_lines)
174
+ if old_str in line
175
+ ]
176
+ raise ToolError(
177
+ f"No replacement was performed. Multiple occurrences of old_str `{old_str}` in lines {lines}. Please ensure it is unique"
178
+ )
179
+
180
+ # Replace old_str with new_str
181
+ new_file_content = file_content.replace(old_str, new_str)
182
+
183
+ # Write the new content to the file
184
+ self.write_file(path, new_file_content)
185
+
186
+ # Save the content to history
187
+ self._file_history[path].append(file_content)
188
+
189
+ # Create a snippet of the edited section
190
+ replacement_line = file_content.split(old_str)[0].count("\n")
191
+ start_line = max(0, replacement_line - SNIPPET_LINES)
192
+ end_line = replacement_line + SNIPPET_LINES + new_str.count("\n")
193
+ snippet = "\n".join(new_file_content.split("\n")[start_line : end_line + 1])
194
+
195
+ # Prepare the success message
196
+ success_msg = f"The file {path} has been edited. "
197
+ success_msg += self._make_output(
198
+ snippet, f"a snippet of {path}", start_line + 1
199
+ )
200
+ success_msg += "Review the changes and make sure they are as expected. Edit the file again if necessary."
201
+
202
+ return CLIResult(output=success_msg)
203
+
204
+ def insert(self, path: Path, insert_line: int, new_str: str):
205
+ """Implement the insert command, which inserts new_str at the specified line in the file content."""
206
+ file_text = self.read_file(path).expandtabs()
207
+ new_str = new_str.expandtabs()
208
+ file_text_lines = file_text.split("\n")
209
+ n_lines_file = len(file_text_lines)
210
+
211
+ if insert_line < 0 or insert_line > n_lines_file:
212
+ raise ToolError(
213
+ f"Invalid `insert_line` parameter: {insert_line}. It should be within the range of lines of the file: {[0, n_lines_file]}"
214
+ )
215
+
216
+ new_str_lines = new_str.split("\n")
217
+ new_file_text_lines = (
218
+ file_text_lines[:insert_line]
219
+ + new_str_lines
220
+ + file_text_lines[insert_line:]
221
+ )
222
+ snippet_lines = (
223
+ file_text_lines[max(0, insert_line - SNIPPET_LINES) : insert_line]
224
+ + new_str_lines
225
+ + file_text_lines[insert_line : insert_line + SNIPPET_LINES]
226
+ )
227
+
228
+ new_file_text = "\n".join(new_file_text_lines)
229
+ snippet = "\n".join(snippet_lines)
230
+
231
+ self.write_file(path, new_file_text)
232
+ self._file_history[path].append(file_text)
233
+
234
+ success_msg = f"The file {path} has been edited. "
235
+ success_msg += self._make_output(
236
+ snippet,
237
+ "a snippet of the edited file",
238
+ max(1, insert_line - SNIPPET_LINES + 1),
239
+ )
240
+ success_msg += "Review the changes and make sure they are as expected (correct indentation, no duplicate lines, etc). Edit the file again if necessary."
241
+ return CLIResult(output=success_msg)
242
+
243
+ def undo_edit(self, path: Path):
244
+ """Implement the undo_edit command."""
245
+ if not self._file_history[path]:
246
+ raise ToolError(f"No edit history found for {path}.")
247
+
248
+ old_text = self._file_history[path].pop()
249
+ self.write_file(path, old_text)
250
+
251
+ return CLIResult(
252
+ output=f"Last edit to {path} undone successfully. {self._make_output(old_text, str(path))}"
253
+ )
254
+
255
+ def read_file(self, path: Path):
256
+ """Read the content of a file from a given path; raise a ToolError if an error occurs."""
257
+ try:
258
+ return path.read_text()
259
+ except Exception as e:
260
+ raise ToolError(f"Ran into {e} while trying to read {path}") from None
261
+
262
+ def write_file(self, path: Path, file: str):
263
+ """Write the content of a file to a given path; raise a ToolError if an error occurs."""
264
+ try:
265
+ path.write_text(file)
266
+ except Exception as e:
267
+ raise ToolError(f"Ran into {e} while trying to write to {path}") from None
268
+
269
+ def _make_output(
270
+ self,
271
+ file_content: str,
272
+ file_descriptor: str,
273
+ init_line: int = 1,
274
+ expand_tabs: bool = True,
275
+ ):
276
+ """Generate output for the CLI based on the content of a file."""
277
+ file_content = maybe_truncate(file_content)
278
+ if expand_tabs:
279
+ file_content = file_content.expandtabs()
280
+ file_content = "\n".join(
281
+ [
282
+ f"{i + init_line:6}\t{line}"
283
+ for i, line in enumerate(file_content.split("\n"))
284
+ ]
285
+ )
286
+ return (
287
+ f"Here's the result of running `cat -n` on {file_descriptor}:\n"
288
+ + file_content
289
+ + "\n"
290
+ )
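A quick sketch of the `EditTool` command flow. `validate_path` insists on absolute paths and `create` refuses to overwrite, so the `/tmp` path below is a placeholder that must not already exist:

```python
# Sketch: create a file, replace a string, view it, then undo the edit.
import asyncio

from computer_use_demo.tools.edit import EditTool


async def main() -> None:
    editor = EditTool()
    path = "/tmp/ootb_edit_demo.txt"  # placeholder; must be absolute and not exist yet
    print((await editor(command="create", path=path, file_text="hello world\n")).output)
    print((await editor(command="str_replace", path=path, old_str="world", new_str="agent")).output)
    print((await editor(command="view", path=path)).output)
    print((await editor(command="undo_edit", path=path)).output)


asyncio.run(main())
```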
computer_use_demo/tools/logger.py ADDED
@@ -0,0 +1,21 @@
1
+ import logging
2
+
3
+
4
+ def truncate_string(s, max_length=500):
5
+ """Truncate long strings for concise printing."""
6
+ if isinstance(s, str) and len(s) > max_length:
7
+ return s[:max_length] + "..."
8
+ return s
9
+
10
+ # Configure logger
11
+ logger = logging.getLogger(__name__)
12
+ logger.setLevel(logging.INFO) # Choose your default level (INFO, DEBUG, etc.)
13
+
14
+
15
+ # Optionally add a console handler if you don't have one already
16
+ if not logger.handlers:
17
+ console_handler = logging.StreamHandler()
18
+ console_handler.setLevel(logging.INFO)
19
+ formatter = logging.Formatter("[%(levelname)s] %(name)s - %(message)s")
20
+ console_handler.setFormatter(formatter)
21
+ logger.addHandler(console_handler)
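Usage sketch for the shared logger and truncation helper above:

```python
# Sketch: long strings get clipped to 500 characters before logging.
from computer_use_demo.tools.logger import logger, truncate_string

logger.info(truncate_string("x" * 2000))  # logs 500 x's followed by "..."
```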
computer_use_demo/tools/run.py ADDED
@@ -0,0 +1,42 @@
1
+ """Utility to run shell commands asynchronously with a timeout."""
2
+
3
+ import asyncio
4
+
5
+ TRUNCATED_MESSAGE: str = "<response clipped><NOTE>To save on context only part of this file has been shown to you. You should retry this tool after you have searched inside the file with `grep -n` in order to find the line numbers of what you are looking for.</NOTE>"
6
+ MAX_RESPONSE_LEN: int = 16000
7
+
8
+
9
+ def maybe_truncate(content: str, truncate_after: int | None = MAX_RESPONSE_LEN):
10
+ """Truncate content and append a notice if content exceeds the specified length."""
11
+ return (
12
+ content
13
+ if not truncate_after or len(content) <= truncate_after
14
+ else content[:truncate_after] + TRUNCATED_MESSAGE
15
+ )
16
+
17
+
18
+ async def run(
19
+ cmd: str,
20
+ timeout: float | None = 120.0, # seconds
21
+ truncate_after: int | None = MAX_RESPONSE_LEN,
22
+ ):
23
+ """Run a shell command asynchronously with a timeout."""
24
+ process = await asyncio.create_subprocess_shell(
25
+ cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
26
+ )
27
+
28
+ try:
29
+ stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout)
30
+ return (
31
+ process.returncode or 0,
32
+ maybe_truncate(stdout.decode(), truncate_after=truncate_after),
33
+ maybe_truncate(stderr.decode(), truncate_after=truncate_after),
34
+ )
35
+ except asyncio.TimeoutError as exc:
36
+ try:
37
+ process.kill()
38
+ except ProcessLookupError:
39
+ pass
40
+ raise TimeoutError(
41
+ f"Command '{cmd}' timed out after {timeout} seconds"
42
+ ) from exc
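Usage sketch for the async `run` helper (the command is illustrative and assumes a POSIX shell):

```python
# Sketch: run a shell command with a timeout and unpack the result tuple.
import asyncio

from computer_use_demo.tools.run import run


async def main() -> None:
    returncode, stdout, stderr = await run("echo out; echo err 1>&2", timeout=10.0)
    print(returncode)      # 0
    print(stdout.strip())  # out
    print(stderr.strip())  # err


asyncio.run(main())
```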
computer_use_demo/tools/screen_capture.py ADDED
@@ -0,0 +1,171 @@
1
+ import subprocess
2
+ import base64
3
+ from pathlib import Path
4
+ from PIL import ImageGrab
5
+ from uuid import uuid4
6
+ from screeninfo import get_monitors
7
+ import platform
8
+ if platform.system() == "Darwin":
9
+ import Quartz  # macOS-only dependency; imported only when running on macOS
10
+
11
12
+ from functools import partial
13
+ from .base import BaseAnthropicTool, ToolError, ToolResult
14
+
15
+
16
+ OUTPUT_DIR = "./tmp/outputs"
17
+
18
+ def get_screenshot(selected_screen: int = 0, resize: bool = True, target_width: int = 1920, target_height: int = 1080):
19
+ # print(f"get_screenshot selected_screen: {selected_screen}")
20
+
21
+ # Get screen width and height using Windows command
22
+ display_num = None
23
+ offset_x = 0
24
+ offset_y = 0
25
+ selected_screen = selected_screen
26
+ width, height = _get_screen_size()
27
+
28
+ """Take a screenshot of the current screen and return a ToolResult with the base64 encoded image."""
29
+ output_dir = Path(OUTPUT_DIR)
30
+ output_dir.mkdir(parents=True, exist_ok=True)
31
+ path = output_dir / f"screenshot_{uuid4().hex}.png"
32
+
33
+ ImageGrab.grab = partial(ImageGrab.grab, all_screens=True)
34
+
35
+ # Detect platform
36
+ system = platform.system()
37
+
38
+ if system == "Windows":
39
+ # Windows: Use screeninfo to get monitor details
40
+ screens = get_monitors()
41
+
42
+ # Sort screens by x position to arrange from left to right
43
+ sorted_screens = sorted(screens, key=lambda s: s.x)
44
+
45
+ if selected_screen < 0 or selected_screen >= len(screens):
46
+ raise IndexError("Invalid screen index.")
47
+
48
+ screen = sorted_screens[selected_screen]
49
+ bbox = (screen.x, screen.y, screen.x + screen.width, screen.y + screen.height)
50
+
51
+ elif system == "Darwin": # macOS
52
+ # macOS: Use Quartz to get monitor details
53
+ max_displays = 32 # Maximum number of displays to handle
54
+ active_displays = Quartz.CGGetActiveDisplayList(max_displays, None, None)[1]
55
+
56
+ # Get the display bounds (resolution) for each active display
57
+ screens = []
58
+ for display_id in active_displays:
59
+ bounds = Quartz.CGDisplayBounds(display_id)
60
+ screens.append({
61
+ 'id': display_id,
62
+ 'x': int(bounds.origin.x),
63
+ 'y': int(bounds.origin.y),
64
+ 'width': int(bounds.size.width),
65
+ 'height': int(bounds.size.height),
66
+ 'is_primary': Quartz.CGDisplayIsMain(display_id) # Check if this is the primary display
67
+ })
68
+
69
+ # Sort screens by x position to arrange from left to right
70
+ sorted_screens = sorted(screens, key=lambda s: s['x'])
71
+ # print(f"Darwin sorted_screens: {sorted_screens}")
72
+
73
+ if selected_screen < 0 or selected_screen >= len(screens):
74
+ raise IndexError("Invalid screen index.")
75
+
76
+ screen = sorted_screens[selected_screen]
77
+
78
+ bbox = (screen['x'], screen['y'], screen['x'] + screen['width'], screen['y'] + screen['height'])
79
+
80
+ else: # Linux or other OS
81
+ cmd = "xrandr | grep ' primary' | awk '{print $4}'"
82
+ try:
83
+ output = subprocess.check_output(cmd, shell=True).decode()
84
+ resolution = output.strip().split()[0]
85
+ width, height = map(int, resolution.split('x'))
86
+ bbox = (0, 0, width, height) # Assuming single primary screen for simplicity
87
+ except subprocess.CalledProcessError:
88
+ raise RuntimeError("Failed to get screen resolution on Linux.")
89
+
90
+ # Take screenshot using the bounding box
91
+ screenshot = ImageGrab.grab(bbox=bbox)
92
+
93
+ # Set offsets (for potential future use)
94
+ offset_x = screen['x'] if system == "Darwin" else (screen.x if system == "Windows" else 0)
95
+ offset_y = screen['y'] if system == "Darwin" else (screen.y if system == "Windows" else 0)
96
+
97
+ # Resize to the requested target resolution if enabled
98
+ if resize:
99
+ screenshot = screenshot.resize((target_width, target_height))
100
+
101
+ # Save the screenshot
102
+ screenshot.save(str(path))
103
+
104
+ if path.exists():
105
+ # Return a ToolResult instance instead of a dictionary
106
+ return screenshot, path
107
+
108
+ raise ToolError(f"Failed to take screenshot: {path} does not exist.")
109
+
110
+
111
+
112
+
113
+ def _get_screen_size(selected_screen: int = 0):
114
+ if platform.system() == "Windows":
115
+ # Use screeninfo to get primary monitor on Windows
116
+ screens = get_monitors()
117
+
118
+ # Sort screens by x position to arrange from left to right
119
+ sorted_screens = sorted(screens, key=lambda s: s.x)
120
+ if selected_screen is None:
121
+ primary_monitor = next((m for m in get_monitors() if m.is_primary), None)
122
+ return primary_monitor.width, primary_monitor.height
123
+ elif selected_screen < 0 or selected_screen >= len(screens):
124
+ raise IndexError("Invalid screen index.")
125
+ else:
126
+ screen = sorted_screens[selected_screen]
127
+ return screen.width, screen.height
128
+ elif platform.system() == "Darwin":
129
+ # macOS part using Quartz to get screen information
130
+ max_displays = 32 # Maximum number of displays to handle
131
+ active_displays = Quartz.CGGetActiveDisplayList(max_displays, None, None)[1]
132
+
133
+ # Get the display bounds (resolution) for each active display
134
+ screens = []
135
+ for display_id in active_displays:
136
+ bounds = Quartz.CGDisplayBounds(display_id)
137
+ screens.append({
138
+ 'id': display_id,
139
+ 'x': int(bounds.origin.x),
140
+ 'y': int(bounds.origin.y),
141
+ 'width': int(bounds.size.width),
142
+ 'height': int(bounds.size.height),
143
+ 'is_primary': Quartz.CGDisplayIsMain(display_id) # Check if this is the primary display
144
+ })
145
+
146
+ # Sort screens by x position to arrange from left to right
147
+ sorted_screens = sorted(screens, key=lambda s: s['x'])
148
+
149
+ if selected_screen is None:
150
+ # Find the primary monitor
151
+ primary_monitor = next((screen for screen in screens if screen['is_primary']), None)
152
+ if primary_monitor:
153
+ return primary_monitor['width'], primary_monitor['height']
154
+ else:
155
+ raise RuntimeError("No primary monitor found.")
156
+ elif selected_screen < 0 or selected_screen >= len(screens):
157
+ raise IndexError("Invalid screen index.")
158
+ else:
159
+ # Return the resolution of the selected screen
160
+ screen = sorted_screens[selected_screen]
161
+ return screen['width'], screen['height']
162
+
163
+ else: # Linux or other OS
164
+ cmd = "xrandr | grep ' primary' | awk '{print $4}'"
165
+ try:
166
+ output = subprocess.check_output(cmd, shell=True).decode()
167
+ resolution = output.strip().split()[0]
168
+ width, height = map(int, resolution.split('x'))
169
+ return width, height
170
+ except subprocess.CalledProcessError:
171
+ raise RuntimeError("Failed to get screen resolution on Linux.")
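A short sketch of grabbing a resized screenshot with the helper above (requires a machine with a display):

```python
# Sketch: capture the left-most screen, resized to 1280x800, and report the file path.
from computer_use_demo.tools.screen_capture import get_screenshot

image, path = get_screenshot(selected_screen=0, resize=True, target_width=1280, target_height=800)
print(image.size, "->", path)
```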
dev-requirements.txt ADDED
@@ -0,0 +1,23 @@
1
+ ruff==0.6.7
2
+ pre-commit==3.8.0
3
+ pytest==8.3.3
4
+ pytest-asyncio==0.23.6
5
+ pyautogui==0.9.54
6
+ streamlit>=1.38.0
7
+ anthropic[bedrock,vertex]>=0.37.1
8
+ jsonschema==4.22.0
9
+ boto3>=1.28.57
10
+ google-auth<3,>=2
11
+ gradio>=5.6.0
12
+ screeninfo
13
+ uiautomation
14
+
15
+ # make sure to install the correct version of torch (cuda, mps, cpu, etc.)
16
+ torch
17
+ torchvision
18
+
19
+ transformers
20
+ qwen-vl-utils
21
+ accelerate
22
+ dashscope
23
+ huggingface_hub
docs/README_cn.md ADDED
@@ -0,0 +1,172 @@
1
+ <h2 align="center">
2
+ <a href="https://computer-use-ootb.github.io">
3
+ <img src="../assets/ootb_logo.png" alt="Logo" style="display: block; margin: 0 auto; filter: invert(1) brightness(2);">
4
+ </a>
5
+ </h2>
6
+
7
+
8
+ <h5 align="center"> 如果你喜欢我们的项目,请在GitHub上为我们加星⭐以获取最新更新。</h5>
9
+
10
+ <h5 align=center>
11
+
12
+ [![arXiv](https://img.shields.io/badge/Arxiv-2411.10323-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2411.10323)
13
+ [![Project Page](https://img.shields.io/badge/Project_Page-GUI_Agent-blue)](https://computer-use-ootb.github.io)
14
+ [![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2Fshowlab%2Fcomputer_use_ootb&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false)](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2Fshowlab%2Fcomputer_use_ootb&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false)
15
+
16
+
17
+ </h5>
18
+
19
+ ## <img src="../assets/ootb_icon.png" alt="Star" style="height:25px; vertical-align:middle; filter: invert(1) brightness(2);"> 概览
20
+ **Computer Use <span style="color:rgb(106, 158, 210)">O</span><span style="color:rgb(111, 163, 82)">O</span><span style="color:rgb(209, 100, 94)">T</span><span style="color:rgb(238, 171, 106)">B</span>**<img src="../assets/ootb_icon.png" alt="Star" style="height:20px; vertical-align:middle; filter: invert(1) brightness(2);"> 是一个桌面GUI Agent的开箱即用(OOTB)解决方案,包括API支持的 (**Claude 3.5 Computer Use**) 和本地运行的模型 (**<span style="color:rgb(106, 158, 210)">S</span><span style="color:rgb(111, 163, 82)">h</span><span style="color:rgb(209, 100, 94)">o</span><span style="color:rgb(238, 171, 106)">w</span>UI**)。
21
+
22
+ **无需Docker**,支持 **Windows** 和 **macOS**。本项目提供了一个基于Gradio的用户友好界面。🎨
23
+
24
+ 想了解更多信息,请访问我们关于Claude 3.5 Computer Use的研究 [[项目页面]](https://computer-use-ootb.github.io)。🌐
25
+
26
+ ## 更新
27
+ - **<span style="color:rgb(231, 183, 98)">重大更新!</span> [2024/12/04]** **本地运行🔥** 已上线!欢迎使用 [**<span style="color:rgb(106, 158, 210)">S</span><span style="color:rgb(111, 163, 82)">h</span><span style="color:rgb(209, 100, 94)">o</span><span style="color:rgb(238, 171, 106)">w</span>UI**](https://github.com/showlab/ShowUI),一个开源的2B视觉-语言-动作(VLA)模型作为GUI Agent。现在可兼容 `"gpt-4o + ShowUI" (~便宜200倍)`* 及 `"Qwen2-VL + ShowUI" (~便宜30倍)`*,只需几美分💰! <span style="color: grey; font-size: small;">*与Claude Computer Use相比</span>。
28
+ - **[2024/11/20]** 我们添加了一些示例来帮助你上手Claude 3.5 Computer Use。
29
+ - **[2024/11/19]** 不再受Anthropic单显示器限制——现在你可以使用 **多显示器** 🎉!
30
+ - **[2024/11/18]** 我们发布了Claude 3.5 Computer Use的深度分析: [https://arxiv.org/abs/2411.10323](https://arxiv.org/abs/2411.10323)。
31
+ - **[2024/11/11]** 不再受Anthropic低分辨率显示限制——你可以使用 *任意分辨率* 同时保持 **截图token成本较低** 🎉!
32
+ - **[2024/11/11]** 现在 **Windows** 和 **macOS** 两个平台均已支持 🎉!
33
+ - **[2024/10/25]** 现在你可以通过手机设备 📱 **远程控制** 你的电脑 💻——**无需在手机上安装APP**!试试吧,玩得开心 🎉。
34
+
35
+ ## 演示视频
36
+
37
+ https://github.com/user-attachments/assets/f50b7611-2350-4712-af9e-3d31e30020ee
38
+
39
+ <div style="display: flex; justify-content: space-around;">
40
+ <a href="https://youtu.be/Ychd-t24HZw" target="_blank" style="margin-right: 10px;">
41
+ <img src="https://img.youtube.com/vi/Ychd-t24HZw/maxresdefault.jpg" alt="Watch the video" width="48%">
42
+ </a>
43
+ <a href="https://youtu.be/cvgPBazxLFM" target="_blank">
44
+ <img src="https://img.youtube.com/vi/cvgPBazxLFM/maxresdefault.jpg" alt="Watch the video" width="48%">
45
+ </a>
46
+ </div>
47
+
48
+
49
+ ## 🚀 开始使用
50
+
51
+ ### 0. 前置条件
52
+ - Install Miniconda via this [link](https://www.anaconda.com/download?utm_source=anacondadocs&utm_medium=documentation&utm_campaign=download&utm_content=topnavalldocs). (**Python version: ≥ 3.11**.) A minimal environment setup is sketched right after this list.
53
+ - Hardware requirements (optional, for running ShowUI locally):
54
+   - **Windows (CUDA-enabled)**: NVIDIA GPU with CUDA support and at least 6 GB of GPU memory
55
+   - **macOS (Apple Silicon)**: M1 chip (or newer) with at least 16 GB of unified memory
56
+
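+ A minimal sketch of preparing such an environment (the environment name `ootb` below is just an example, not something the project requires):
+
+ ```bash
+ # Create and activate a fresh Conda environment with a compatible Python version
+ conda create -n ootb python=3.11 -y
+ conda activate ootb
+ python --version   # should report 3.11.x or newer
+ ```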
57
+
58
+ ### 1. Clone the Repository 📂
59
+ Open the Conda terminal. (It appears in the Start menu after Miniconda is installed.)
60
+ Run the following commands in the **Conda terminal**:
61
+ ```bash
62
+ git clone https://github.com/showlab/computer_use_ootb.git
63
+ cd computer_use_ootb
64
+ ```
65
+
66
+ ### 2.1 Install Dependencies 🔧
67
+ ```
68
+ pip install -r dev-requirements.txt
69
+ ```
70
+
71
+ ### 2.2 (Optional) Prepare to Run **<span style="color:rgb(106, 158, 210)">S</span><span style="color:rgb(111, 163, 82)">h</span><span style="color:rgb(209, 100, 94)">o</span><span style="color:rgb(238, 171, 106)">w</span>UI** Locally
72
+
73
+ 1. Download all files of the ShowUI-2B model with the command below. Make sure the ShowUI-2B folder ends up inside the computer_use_ootb folder.
74
+
75
+
76
+ ```
77
+ python install_tools/install_showui.py
78
+ ```
79
+
80
+
81
+ 2. Install the GPU build of PyTorch (CUDA, MPS, etc.) that matches your machine. See the [installation guide and verification steps](https://pytorch.org/get-started/locally/). A quick availability check is sketched right after this list.
82
+
83
+ 3. Get an API key for [GPT-4o](https://platform.openai.com/docs/quickstart) or [Qwen-VL](https://help.aliyun.com/zh/dashscope/developer-reference/acquisition-and-configuration-of-api-key). Users in mainland China can get a free trial of the Qwen API with 1 million tokens: [details here](https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-plus-api).
84
+
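+ As a quick sanity check for step 2 above (assuming PyTorch is already installed), you can print which accelerator backends PyTorch can see:
+
+ ```bash
+ # Expect CUDA: True on a supported NVIDIA GPU, or MPS: True on Apple Silicon
+ python -c "import torch; print('CUDA:', torch.cuda.is_available(), '| MPS:', torch.backends.mps.is_available())"
+ ```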
85
+ ### 3. Launch the Interface ▶️
86
+
87
+ **Start the OOTB interface:**
88
+ ```
89
+ python app.py
90
+ ```
91
+
92
+ If the interface launches successfully, you will see two URLs in the terminal:
93
+ ```
94
+ * Running on local URL: http://127.0.0.1:7860
95
+ * Running on public URL: https://xxxxxxxxxxxxxxxx.gradio.live (Do not share this link with others, or they will be able to control your computer.)
96
+ ```
97
+
98
+
99
+ > <u>For convenience</u>, we recommend setting the API keys as environment variables with the commands below before launching the interface, so you do not have to enter them manually on every run.
100
+ In Windows PowerShell (if you are in cmd, use the `set` command instead):
101
+ >
102
+ ```
103
+ $env:ANTHROPIC_API_KEY="sk-xxxxx" (replace with your key)
104
+ $env:QWEN_API_KEY="sk-xxxxx"
105
+ $env:OPENAI_API_KEY="sk-xxxxx"
106
+ ```
107
+
108
+ > On macOS/Linux, replace `$env:ANTHROPIC_API_KEY` in the commands above with `export ANTHROPIC_API_KEY`, for example:
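+
+ A macOS/Linux example (the `sk-xxxxx` values are placeholders for your own keys):
+
+ ```bash
+ # Set the API keys for the current shell session
+ export ANTHROPIC_API_KEY="sk-xxxxx"
+ export QWEN_API_KEY="sk-xxxxx"
+ export OPENAI_API_KEY="sk-xxxxx"
+ ```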
109
+
110
+
111
+ ### 4. Control Your Computer from Any Device That Can Reach the Web
112
+ - **Computer to be controlled**: the machine with the software installed as described above.
113
+ - **Device sending the commands**: any device that can open the URL.
114
+
115
+ Open http://localhost:7860/ in a browser on the controlled computer (for local control), or https://xxxxxxxxxxxxxxxxx.gradio.live in your mobile browser (for remote control).
116
+
117
+ Enter your Anthropic API key (you can obtain one from this [page](https://console.anthropic.com/settings/keys)), then give instructions for the AI to perform the tasks.
118
+
119
+ <div style="display: flex; align-items: center; gap: 10px;">
120
+ <figure style="text-align: center;">
121
+ <img src="./assets/gradio_interface.png" alt="Desktop Interface" style="width: auto; object-fit: contain;">
122
+ </figure>
123
+ </div>
124
+
125
+
126
+
127
+ ## 🖥️ Supported Systems
128
+ - **Windows** (Claude ✅, ShowUI ✅)
129
+ - **macOS** (Claude ✅, ShowUI ✅)
130
+
131
+ ## ⚠️ Risks
132
+ - **The model may perform dangerous actions**: the model still has limitations and may produce unintended or potentially harmful output. We recommend continuously supervising the AI's actions.
133
+ - **Cost control**: each task may cost a few dollars with Claude 3.5 Computer Use. 💸
134
+
135
+ ## 📅 Roadmap
136
+ - [ ] **Explore available features**
137
+   - [ ] The Claude API seems unstable when solving tasks. We are investigating the cause: resolution, action types, OS platform, or the planning mechanism. Ideas and comments are welcome.
138
+ - [ ] **Interface design**
139
+   - [x] **Gradio support** ✨
140
+   - [ ] **Simpler installation process**
141
+   - [ ] **More features**... 🚀
142
+ - [ ] **Platforms**
143
+   - [x] **Windows**
144
+   - [x] **Mobile** (sending commands)
145
+   - [x] **macOS**
146
+   - [ ] **Mobile** (being controlled)
147
+ - [ ] **Support for more multimodal LLMs (MLLMs)**
148
+   - [x] **Claude 3.5 Sonnet** 🎵
149
+   - [x] **GPT-4o**
150
+   - [x] **Qwen2-VL**
151
+   - [ ] ...
152
+ - [ ] **Improved prompting strategy**
153
+   - [ ] Optimize prompts to reduce cost. 💡
154
+ - [ ] **Faster inference**
155
+   - [ ] Support int8 quantization.
156
+
157
+ ## Join Discussion
158
+ Welcome to join the discussion and help us keep improving the user experience of Computer Use - OOTB. Reach us via the [**Discord channel**](https://discord.gg/HnHng5de) or the WeChat QR codes below!
159
+
160
+ <div style="display: flex; flex-direction: row; justify-content: space-around;">
161
+
162
+ <img src="../assets/wechat_2.jpg" alt="gradio_interface" width="30%">
163
+ <img src="../assets/wechat.jpg" alt="gradio_interface" width="30%">
164
+
165
+ </div>
166
+
167
+ <div style="height: 30px;"></div>
168
+
169
+ <hr>
170
+ <a href="https://computer-use-ootb.github.io">
171
+ <img src="../assets/ootb_logo.png" alt="Logo" width="30%" style="display: block; margin: 0 auto; filter: invert(1) brightness(2);">
172
+ </a>
install_tools/install_showui-awq-4bit.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import hf_hub_download, list_repo_files
3
+
4
+ # Specify the model repository and destination folder
5
+ model_repo = "yyyang/showui-2b-awq"
6
+ destination_folder = "./showui-2b-awq"
7
+
8
+ # Ensure the destination folder exists
9
+ os.makedirs(destination_folder, exist_ok=True)
10
+
11
+ # List all files in the repository
12
+ files = list_repo_files(repo_id=model_repo)
13
+
14
+ # Download each file to the destination folder
15
+ for file in files:
16
+ file_path = hf_hub_download(repo_id=model_repo, filename=file, local_dir=destination_folder)
17
+ print(f"Downloaded {file} to {file_path}")
install_tools/install_showui.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import hf_hub_download, list_repo_files
3
+
4
+ # Specify the model repository and destination folder
5
+ model_repo = "showlab/ShowUI-2B"
6
+ destination_folder = "./showui-2b"
7
+
8
+ # Ensure the destination folder exists
9
+ os.makedirs(destination_folder, exist_ok=True)
10
+
11
+ # List all files in the repository
12
+ files = list_repo_files(repo_id=model_repo)
13
+
14
+ # Download each file to the destination folder
15
+ for file in files:
16
+ file_path = hf_hub_download(repo_id=model_repo, filename=file, local_dir=destination_folder)
17
+ print(f"Downloaded {file} to {file_path}")
install_tools/install_uitars-2b-sft.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import hf_hub_download, list_repo_files
3
+
4
+ # Specify the model repository and destination folder
5
+ model_repo = "bytedance-research/UI-TARS-2B-SFT"
6
+ destination_folder = "./ui-tars-2b-sft"
7
+
8
+ # Ensure the destination folder exists
9
+ os.makedirs(destination_folder, exist_ok=True)
10
+
11
+ # List all files in the repository
12
+ files = list_repo_files(repo_id=model_repo)
13
+
14
+ # Download each file to the destination folder
15
+ for file in files:
16
+ file_path = hf_hub_download(repo_id=model_repo, filename=file, local_dir=destination_folder)
17
+ print(f"Downloaded {file} to {file_path}")
install_tools/test_ui-tars_server.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ from computer_use_demo.gui_agent.llm_utils.oai import encode_image
3
+
4
+ _NAV_SYSTEM_GROUNDING = """
5
+ You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
6
+
7
+ ## Output Format
8
+ ```Action: ...```
9
+
10
+ ## Action Space
11
+ click(start_box='<|box_start|>(x1,y1)<|box_end|>')
12
+ hotkey(key='')
13
+ type(content='') #If you want to submit your input, use \"\" at the end of `content`.
14
+ scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
15
+ wait() #Sleep for 5s and take a screenshot to check for any changes.
16
+ finished()
17
+ call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
18
+
19
+ ## Note
20
+ - Do not generate any other text.
21
+ """
22
+
23
+ def get_prompt_grounding(task):
24
+ return f"""{task}"""
25
+
26
+ task = """
27
+ ```json
28
+ {{ "Observation": "I am on the google homepage of the Chrome browser.",
29
+ "Thinking": "The user wants to buy a lap-top on Amazon.com, so I need to click on the address (search) bar of Chrome for entering the 'Amazon.com'.",
30
+ "Next Action": ["I need to click DSML"],
31
+ "Expectation": "The search button is activated after being clicked, ready to input."
32
+ }}```
33
+ """
34
+
35
+ task = """
36
+ ```json
37
+ {{
38
+ "Observation": "I am on the google homepage of the Chrome browser.",
39
+ "Thinking": "The user wants to click DSML",
40
+ "Next Action": ["I need to click DSML"],
41
+ }}```
42
+ """
43
+
44
+ task = """
45
+ ```json
46
+ {{
47
+ "Observation": "I am on the google homepage of the Chrome browser.",
48
+ "Thinking": "The user wants to click Youtube",
49
+ "Next Action": ["I need to click Youtube"],
50
+ }}```
51
+ """
52
+
53
+ if __name__ == "__main__":
54
+
55
+ # Base URL of a UI-TARS OpenAI-compatible server (placeholder; replace with your own endpoint).
+ ui_tars_url = "https://your_api_to_uitars.com/v1"
56
+ ui_tars_client = OpenAI(base_url=ui_tars_url, api_key="")
57
+ grounding_system_prompt = _NAV_SYSTEM_GROUNDING.format()
58
+ screenshot_base64 = encode_image("./chrome.png")
59
+ prompted_message = get_prompt_grounding(task)
60
+
61
+ print(f"grounding_system_prompt, {grounding_system_prompt}, \
62
+ prompted_message: {prompted_message}")
63
+
64
+ response = ui_tars_client.chat.completions.create(
65
+ model="ui-tars",
66
+ messages=[
67
+ {"role": "user", "content": grounding_system_prompt},
68
+ {"role": "user", "content": [
69
+ {"type": "text", "text": prompted_message},
70
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_base64}"}}
71
+ ]
72
+ },
73
+ ],
74
+ max_tokens=128,
75
+ temperature=0
76
+ )
77
+
78
+ ui_tars_action = response.choices[0].message.content
79
+
80
+ print(ui_tars_action)
81
+
82
+