diff --git a/macosagent/agents/wechat_agent/agent/prompt.py b/macosagent/agents/wechat_agent/agent/prompt.py index ebb616a..cddea39 100644 --- a/macosagent/agents/wechat_agent/agent/prompt.py +++ b/macosagent/agents/wechat_agent/agent/prompt.py @@ -68,15 +68,15 @@ - Keep track of the status and subresults in the memory. 9. Action specific rules: -- Scrolling: When performing a scroll action, use the "scroll" action. - - The `amount` parameter follows the rules of `pyautogui.scroll(amount)`: - - A **positive** value scrolls **up**. - - A **negative** value scrolls **down**. - - The **magnitude** of the `amount` value determines the scroll distance. - - The value of the 'amount' parameter should be between -10 and 10 for a single scroll action to ensure it is manageable and does not cause unexpected behavior. +- Scrolling: When performing a scroll action, use either the "scroll_up" or "scroll_down" action depending on direction. + - The `amount` parameter must be a positive integer indicating the scroll distance. + - The direction is determined by the action type: + - `scroll_up`: Scrolls the element up (content moves down) + - `scroll_down`: Scrolls the element down (content moves up) + - Recommended value range for `amount` is between 1 and 10 to ensure smooth behavior. - Example usages: - - **Scroll up (move page content down)**: `{"scroll": {"index": , "amount": 10}}` - - **Scroll down (move page content up)**: `{"scroll": {"index": , "amount": -10}}` + - Scroll up: `{"scroll_up": {"index": , "amount": 5}}` + - Scroll down: `{"scroll_down": {"index": , "amount": 5}}` - Inputting Emojis: When inputting emojis, use the "inputs" action. - An example format is "{inputs: {'index': , "text": "😊"}}". 
@@ -92,9 +92,15 @@ - Adding new content: - Example: `{"inputs": {"index": , "text": " new text"}}` -- Summarizing and Inputting Content: - - When required to summarize content, the agent must first extract key points and then input the summary into the designated field using the "input" action. - - Example: `{"inputs": {"index": , "text": ""}}` +- Extracting Content: When the task requires analyzing or extracting information from the screen (e.g. summarizing a message, identifying key elements, or fulfilling user-specific requirements), use the "extract_content" action. + - Provide a clear extraction requirement in the "target" field to guide what should be extracted. + - Provide the source text or raw content in the "content" field. + - The result of this action will be stored in memory and included in the reasoning chain for follow-up actions. + - Example: + {"extract_content": { + "target": "Summarize the message for forwarding", + "content": "Hi! Here is the updated schedule: Monday - Team Meeting at 10am; Tuesday - Client call at 2pm. Let me know if you're available." + }} - Sending Files: When sending files, always assume the file is already in the clipboard. - Use the "paste" action to paste the file, followed by the "send" action to confirm sending. 
# Scroll actions (macosagent/agents/wechat_agent/controller/service.py).
# NOTE(review): `self` is not defined in the visible code; these handlers are
# presumably declared inside a controller method that provides it -- confirm.


async def _scroll_element(params: ScrollAction, context: WechatContext, direction: str) -> ActionResult:
    """Hover over an accessibility-tree element and scroll it up or down.

    Args:
        params: Action parameters. ``index`` selects the element by its
            ``id``; ``amount`` is the scroll distance (its sign is ignored,
            and ``None`` falls back to 10).
        context: WeChat context whose ``state`` supplies the accessibility
            tree (``accessibility_tree_json``) and ``offset``.
        direction: ``"up"`` or ``"down"``.

    Returns:
        An ``ActionResult`` describing the scroll on success, or carrying
        the error text on failure. ``is_done`` is always ``False``.
    """
    try:
        # The lookup lives inside the try block so that an unknown index is
        # reported as a failed action instead of an unhandled IndexError.
        matches = [item for item in context.state.accessibility_tree_json if item["id"] == params.index]
        if not matches:
            raise LookupError(f"No element with index {params.index} in the accessibility tree")
        target_element = matches[0]

        # Aim at the centre of the element's bounding box, shifted by
        # context.state.offset (presumably the window origin -- confirm).
        x, y, w, h = target_element["bbox"]
        offset = context.state.offset
        x = int(x + offset[0] + w / 2)
        y = int(y + offset[1] + h / 2)

        distance = abs(params.amount) if params.amount is not None else 10
        # pyautogui.scroll() scrolls UP for positive values and DOWN for
        # negative ones, so the sign is derived from the requested direction.
        amount = distance if direction == "up" else -distance

        logger.info(f"Scroll element {target_element} {direction}")
        pyautogui.moveTo(x, y, duration=0.1)
        pyautogui.scroll(amount)
        return ActionResult(
            is_done=False,
            success=True,
            extracted_content=f"🌀 Scrolled {direction} {distance} pixels",
            include_in_memory=True,
        )
    except Exception as e:
        logger.error(f"Error scrolling {direction}: {e}")
        return ActionResult(
            is_done=False,
            success=False,
            extracted_content=str(e),
            include_in_memory=True,
        )


@self.registry.action(
    "Scroll element up",
    param_model=ScrollAction,
)
async def scroll_up(params: ScrollAction, context: WechatContext):
    """Scroll the element identified by ``params.index`` up."""
    return await _scroll_element(params, context, "up")


@self.registry.action(
    "Scroll element down",
    param_model=ScrollAction,
)
async def scroll_down(params: ScrollAction, context: WechatContext):
    """Scroll the element identified by ``params.index`` down."""
    return await _scroll_element(params, context, "down")
# --- macosagent/agents/wechat_agent/controller/service.py ---
# NOTE(review): `self` is not defined in the visible code; this handler is
# presumably declared inside a controller method that provides it -- confirm.
@self.registry.action(
    "Extract content",
    param_model=ExtractContent,
)
async def extract_content(params: ExtractContent, context: WechatContext):
    """Record model-supplied extracted content as an action result.

    The handler performs no extraction of its own: it echoes
    ``params.content`` together with the requirement in ``params.target``
    so the text is logged and kept in memory for follow-up steps.

    Args:
        params: Extraction requirement (``target``) and the text to keep
            (``content``).
        context: WeChat context; not used by this action.

    Returns:
        An ``ActionResult`` with ``include_in_memory=True``. On success it
        holds the formatted content; on failure it holds the error text.
    """
    # Bound before the try block so the except handler can always format it;
    # otherwise a failure before the assignment would raise UnboundLocalError
    # and hide the original exception.
    target = None
    try:
        target = params.target
        content = params.content
        summary = f"Extracted content based on the requirement '{target}': {content}"

        logger.info(summary)

        return ActionResult(
            is_done=False,
            success=True,
            extracted_content=f"📋 {summary}",
            include_in_memory=True,
        )
    except Exception as e:
        logger.error(f"Error extracting content based on the requirement '{target}': {e}")
        return ActionResult(is_done=False, success=False, extracted_content=str(e), include_in_memory=True)


# --- macosagent/agents/wechat_agent/controller/views.py ---
class ExtractContent(BaseModel):
    """Parameters accepted by the "extract_content" action."""

    # Requirement describing what should be extracted.
    target: str
    # Source text / raw content the requirement applies to.
    content: str