Computer-use-agents · bofei5675 · Jun 5, 2025 · Jun 5, 2025
diff --git a/macosagent/agents/wechat_agent/agent/prompt.py b/macosagent/agents/wechat_agent/agent/prompt.py
@@ -68,15 +68,15 @@
 - Keep track of the status and subresults in the memory. 
 
 9. Action specific rules:
-- Scrolling: When performing a scroll action, use the "scroll" action. 
-    - The `amount` parameter follows the rules of `pyautogui.scroll(amount)`:  
-      - A **positive** value scrolls **up**.  
-      - A **negative** value scrolls **down**.  
-      - The **magnitude** of the `amount` value determines the scroll distance.  
-      - The value of the 'amount' parameter should be between -10 and 10 for a single scroll action to ensure it is manageable and does not cause unexpected behavior.
+- Scrolling: When performing a scroll action, use either the "scroll_up" or "scroll_down" action depending on direction. 
+    - The `amount` parameter must be a positive integer indicating the scroll distance.
+    - The direction is determined by the action type:
+        - `scroll_up`: Scrolls the element up (content moves down)
+        - `scroll_down`: Scrolls the element down (content moves up)
+    - Recommended value range for `amount` is between 1 and 10 to ensure smooth behavior.
     - Example usages:  
-      - **Scroll up (move page content down)**: `{"scroll": {"index": <target_element_index>, "amount": 10}}`  
-      - **Scroll down (move page content up)**: `{"scroll": {"index": <target_element_index>, "amount": -10}}`  
+        - Scroll up: `{"scroll_up": {"index": <target_element_index>, "amount": 5}}`  
+        - Scroll down: `{"scroll_down": {"index": <target_element_index>, "amount": 5}}` 
 
 - Inputting Emojis: When inputting emojis, use the "inputs" action.
     - An example format is "{inputs: {'index': <target_element_index>, "text": "😊"}}". 
@@ -92,9 +92,15 @@
     - Adding new content:   
         - Example: `{"inputs": {"index": <target_element_index>, "text": "<existing content> new text"}}`
 
-- Summarizing and Inputting Content:  
-    - When required to summarize content, the agent must first extract key points and then input the summary into the designated field using the "input" action.
-    - Example: `{"inputs": {"index": <target_element_index>, "text": "<summary of extracted content>"}}`
+- Extracting Content: When the task requires analyzing or extracting information from the screen (e.g. summarizing a message, identifying key elements, or fulfilling user-specific requirements), use the "extract_content" action.
+    - Provide a clear extraction requirement in the "target" field to guide what should be extracted.
+    - Provide the source text or raw content in the "content" field.
+    - The result of this action will be stored in memory and included in the reasoning chain for follow-up actions.
+    - Example: 
+        {"extract_content": {
+            "target": "Summarize the message for forwarding",
+            "content": "Hi! Here is the updated schedule: Monday - Team Meeting at 10am; Tuesday - Client call at 2pm. Let me know if you're available."
+        }}
 
 - Sending Files: When sending files, always assume the file is already in the clipboard.
     - Use the "paste" action to paste the file, followed by the "send" action to confirm sending.

diff --git a/macosagent/agents/wechat_agent/controller/service.py b/macosagent/agents/wechat_agent/controller/service.py
@@ -11,7 +11,7 @@
 
 from macosagent.agents.wechat_agent.controller.registry.service import Registry
 from macosagent.agents.wechat_agent.wechat.context import WechatContext
-from macosagent.agents.wechat_agent.controller.views import ClickElementAction, InputAction, DoneAction, ScrollAction, PasteAction, CopyAction, RightClickElementAction, SendAction
+from macosagent.agents.wechat_agent.controller.views import ClickElementAction, InputAction, DoneAction, ScrollAction, PasteAction, CopyAction, RightClickElementAction, SendAction, ExtractContent
 from macosagent.agents.wechat_agent.agent.views import ActionResult, ActionModel
 #from macosagent.agents.wechat_agent.agent.llm import create_gpt_4o_tool  
 
@@ -136,27 +136,67 @@ async def inputs(params: InputAction, context: WechatContext):
                 return ActionResult(is_done=False, success=False, extracted_content=str(e), include_in_memory=True)
 
         @self.registry.action(
-            "Scroll element",
+            "Scroll element up",
             param_model=ScrollAction,
         )
-        async def scroll(params: ScrollAction, context: WechatContext):
+        async def scroll_up(params: ScrollAction, context: WechatContext):
+            """Scroll the element whose id is `params.index` up and return an ActionResult describing the outcome."""
             index = params.index
             element = [item for item in context.state.accessibility_tree_json if item["id"] == index]
             x, y, w, h = element[0]["bbox"]
             offset = context.state.offset
-            x = int(x + offset[0] + w/2)
-            y = int(y + offset[1] + h/2)
-            logger.info(f"Scroll element {element[0]}")
+            # Aim at the centre of the element's bbox, shifted by the context offset.
+            x = int(x + offset[0] + w / 2)
+            y = int(y + offset[1] + h / 2)
             try:
-                amount = params.amount if params.amount is not None else 0  # Default to scrolling one page if amount is None
-                logger.info(f"Scrolling {amount} pixels")
-                # pyautogui.click(x=x, y=y, clicks=1, interval=0.1, button='left')
+                # pyautogui.scroll() scrolls UP for positive values, so force the amount
+                # positive whatever sign was supplied; fall back to 10 when it is missing.
+                amount = abs(params.amount) if params.amount is not None else 10
+                logger.info(f"Scroll element {element[0]} up")
                 pyautogui.moveTo(x, y, duration=0.1)
-                pyautogui.scroll(amount)  # Use pyautogui to scroll by 'amount' pixels
-                return ActionResult(is_done=False, success=True, extracted_content=f"🌀  Scrolled {amount} pixels", include_in_memory=True)
+                pyautogui.scroll(amount)
+                return ActionResult(
+                    is_done=False,
+                    success=True,
+                    extracted_content=f"🌀 Scrolled up {abs(amount)} pixels",
+                    include_in_memory=True,
+                )
             except Exception as e:
-                logger.error(f"Error scrolling: {e}")
-                return ActionResult(is_done=False, success=False, extracted_content=str(e), include_in_memory=True)
+                logger.error(f"Error scrolling up: {e}")
+                # Report the failure back as a result instead of raising.
+                return ActionResult(is_done=False, success=False, extracted_content=str(e), include_in_memory=True)
+
+        @self.registry.action(
+            "Scroll element down",
+            param_model=ScrollAction,
+        )
+        async def scroll_down(params: ScrollAction, context: WechatContext):
+            """Scroll the element whose id is `params.index` down and return an ActionResult describing the outcome."""
+            index = params.index
+            element = [item for item in context.state.accessibility_tree_json if item["id"] == index]
+            x, y, w, h = element[0]["bbox"]
+            offset = context.state.offset
+            # Aim at the centre of the element's bbox, shifted by the context offset.
+            x = int(x + offset[0] + w / 2)
+            y = int(y + offset[1] + h / 2)
+            try:
+                # pyautogui.scroll() scrolls DOWN for negative values, so force the amount
+                # negative whatever sign was supplied; fall back to -10 when it is missing.
+                amount = -abs(params.amount) if params.amount is not None else -10
+                logger.info(f"Scroll element {element[0]} down")
+                pyautogui.moveTo(x, y, duration=0.1)
+                pyautogui.scroll(amount)
+                return ActionResult(
+                    is_done=False,
+                    success=True,
+                    extracted_content=f"🌀 Scrolled down {abs(amount)} pixels",
+                    include_in_memory=True,
+                )
+            except Exception as e:
+                logger.error(f"Error scrolling down: {e}")
+                # Report the failure back as a result instead of raising.
+                return ActionResult(is_done=False, success=False, extracted_content=str(e), include_in_memory=True)
+
 
         @self.registry.action(
             "Paste clipboard text",
@@ -237,6 +277,23 @@ async def send(params: SendAction, context: WechatContext):
                 logger.error(f"Error pasting text: {e}")
                 return ActionResult(is_done=False, success=False, extracted_content=str(e), include_in_memory=True)
 
+        @self.registry.action(
+            "Extract content",
+            param_model=ExtractContent,
+        )
+        async def extract_content(params: ExtractContent, context: WechatContext):
+            """Return the supplied `content` for the stated `target` requirement as an ActionResult with include_in_memory=True."""
+            # Bind both fields before the try so the except handler can never hit an unbound `target`.
+            target = params.target
+            content = params.content
+            try:
+                logger.info(f"Extracted content based on the requirement '{target}': {content}")
+                # Nothing is computed here: the content passed in is echoed back in the result.
+                return ActionResult(is_done=False, success=True, extracted_content=f"📋 Extracted content based on the requirement '{target}': {content}", include_in_memory=True)
+            except Exception as e:
+                logger.error(f"Error extracting content based on the requirement '{target}': {e}")
+                return ActionResult(is_done=False, success=False, extracted_content=str(e), include_in_memory=True)
+
         # @self.registry.action(
         #     "Modify clipboard text",
         #     param_model=ModifyClipboardTextAction,

diff --git a/macosagent/agents/wechat_agent/controller/views.py b/macosagent/agents/wechat_agent/controller/views.py
@@ -31,6 +31,10 @@ class CopyAction(BaseModel):
 class SendAction(BaseModel):
 	index: int
 
+class ExtractContent(BaseModel):  # parameter model of the "extract_content" action
+	target: str  # the extraction requirement, i.e. what should be extracted
+	content: str  # the source text / raw content the requirement applies to
+
 # class ModifyClipboardTextAction(BaseModel):
 # 	requirement_text: str