Skip to content

Update wechat_agent actions #5

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions macosagent/agents/wechat_agent/agent/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,15 @@
- Keep track of the status and subresults in the memory.

9. Action specific rules:
- Scrolling: When performing a scroll action, use the "scroll" action.
- The `amount` parameter follows the rules of `pyautogui.scroll(amount)`:
- A **positive** value scrolls **up**.
- A **negative** value scrolls **down**.
- The **magnitude** of the `amount` value determines the scroll distance.
- The value of the 'amount' parameter should be between -10 and 10 for a single scroll action to ensure it is manageable and does not cause unexpected behavior.
- Scrolling: When performing a scroll action, use either the "scroll_up" or "scroll_down" action depending on direction.
- The `amount` parameter must be a positive integer indicating the scroll distance.
- The direction is determined by the action type:
- `scroll_up`: Scrolls the element up (content moves down)
- `scroll_down`: Scrolls the element down (content moves up)
- Recommended value range for `amount` is between 1 and 10 to ensure smooth behavior.
- Example usages:
- **Scroll up (move page content down)**: `{"scroll": {"index": <target_element_index>, "amount": 10}}`
- **Scroll down (move page content up)**: `{"scroll": {"index": <target_element_index>, "amount": -10}}`
- Scroll up: `{"scroll_up": {"index": <target_element_index>, "amount": 5}}`
- Scroll down: `{"scroll_down": {"index": <target_element_index>, "amount": 5}}`

- Inputting Emojis: When inputting emojis, use the "inputs" action.
- An example format is `{"inputs": {"index": <target_element_index>, "text": "😊"}}`.
Expand All @@ -92,9 +92,15 @@
- Adding new content:
- Example: `{"inputs": {"index": <target_element_index>, "text": "<existing content> new text"}}`

- Summarizing and Inputting Content:
- When required to summarize content, the agent must first extract key points and then input the summary into the designated field using the "input" action.
- Example: `{"inputs": {"index": <target_element_index>, "text": "<summary of extracted content>"}}`
- Extracting Content: When the task requires analyzing or extracting information from the screen (e.g. summarizing a message, identifying key elements, or fulfilling user-specific requirements), use the "extract_content" action.
- Provide a clear extraction requirement in the "target" field to guide what should be extracted.
- Provide the source text or raw content in the "content" field.
- The result of this action will be stored in memory and included in the reasoning chain for follow-up actions.
- Example:
{"extract_content": {
"target": "Summarize the message for forwarding",
"content": "Hi! Here is the updated schedule: Monday - Team Meeting at 10am; Tuesday - Client call at 2pm. Let me know if you're available."
}}

- Sending Files: When sending files, always assume the file is already in the clipboard.
- Use the "paste" action to paste the file, followed by the "send" action to confirm sending.
Expand Down
83 changes: 70 additions & 13 deletions macosagent/agents/wechat_agent/controller/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from macosagent.agents.wechat_agent.controller.registry.service import Registry
from macosagent.agents.wechat_agent.wechat.context import WechatContext
from macosagent.agents.wechat_agent.controller.views import ClickElementAction, InputAction, DoneAction, ScrollAction, PasteAction, CopyAction, RightClickElementAction, SendAction
from macosagent.agents.wechat_agent.controller.views import ClickElementAction, InputAction, DoneAction, ScrollAction, PasteAction, CopyAction, RightClickElementAction, SendAction, ExtractContent
from macosagent.agents.wechat_agent.agent.views import ActionResult, ActionModel
#from macosagent.agents.wechat_agent.agent.llm import create_gpt_4o_tool

Expand Down Expand Up @@ -136,27 +136,67 @@ async def inputs(params: InputAction, context: WechatContext):
return ActionResult(is_done=False, success=False, extracted_content=str(e), include_in_memory=True)

@self.registry.action(
"Scroll element",
"Scroll element up",
param_model=ScrollAction,
)
async def scroll(params: ScrollAction, context: WechatContext):
async def scroll_up(params: ScrollAction, context: WechatContext):
index = params.index
element = [item for item in context.state.accessibility_tree_json if item["id"] == index]
x, y, w, h = element[0]["bbox"]
offset = context.state.offset
x = int(x + offset[0] + w/2)
y = int(y + offset[1] + h/2)
logger.info(f"Scroll element {element[0]}")
x = int(x + offset[0] + w / 2)
y = int(y + offset[1] + h / 2)
try:
amount = params.amount if params.amount is not None else 0 # Default to scrolling one page if amount is None
logger.info(f"Scrolling {amount} pixels")
# pyautogui.click(x=x, y=y, clicks=1, interval=0.1, button='left')
amount = -abs(params.amount) if params.amount is not None else -10
logger.info(f"Scroll element {element[0]} up")
pyautogui.moveTo(x, y, duration=0.1)
pyautogui.scroll(amount) # Use pyautogui to scroll by 'amount' pixels
return ActionResult(is_done=False, success=True, extracted_content=f"🌀 Scrolled {amount} pixels", include_in_memory=True)
pyautogui.scroll(amount)
return ActionResult(
is_done=False,
success=True,
extracted_content=f"🌀 Scrolled up {abs(amount)} pixels",
include_in_memory=True,
)
except Exception as e:
logger.error(f"Error scrolling: {e}")
return ActionResult(is_done=False, success=False, extracted_content=str(e), include_in_memory=True)
logger.error(f"Error scrolling up: {e}")
return ActionResult(
is_done=False,
success=False,
extracted_content=str(e),
include_in_memory=True,
)

@self.registry.action(
    "Scroll element down",
    param_model=ScrollAction,
)
async def scroll_down(params: ScrollAction, context: WechatContext):
    """Scroll the element identified by ``params.index`` with the mouse wheel.

    The pointer is first moved to the centre of the element's bounding box
    (shifted by ``context.state.offset``), then ``pyautogui.scroll`` is
    called with a non-negative amount.

    Args:
        params: Carries ``index`` (matched against ``item["id"]`` in the
            accessibility tree) and ``amount``. The sign of ``amount`` is
            discarded; ``None`` falls back to 10.
        context: Supplies ``state.accessibility_tree_json`` and
            ``state.offset``.

    Returns:
        An ``ActionResult`` with ``success=True`` once the scroll was
        issued, or ``success=False`` carrying the error text when anything
        fails, including an ``index`` that matches no element.
    """
    try:
        # Resolve the element inside the ``try`` so that an unknown index is
        # reported back to the agent as a failed action instead of escaping
        # as an unhandled IndexError.
        element = next(
            (
                item
                for item in context.state.accessibility_tree_json
                if item["id"] == params.index
            ),
            None,
        )
        if element is None:
            raise ValueError(f"No element with index {params.index} to scroll")

        x, y, w, h = element["bbox"]
        offset = context.state.offset
        # Centre of the bounding box, translated by the context offset.
        x = int(x + offset[0] + w / 2)
        y = int(y + offset[1] + h / 2)

        # NOTE(review): PyAutoGUI documents a positive value as scrolling
        # *up*; this "down" action passes a positive value. Confirm the
        # intended direction on the target platform before changing it, and
        # keep it in sync with the sibling scroll_up action.
        amount = abs(params.amount) if params.amount is not None else 10
        logger.info(f"Scroll element {element} down")
        pyautogui.moveTo(x, y, duration=0.1)
        pyautogui.scroll(amount)
        return ActionResult(
            is_done=False,
            success=True,
            extracted_content=f"🌀 Scrolled down {abs(amount)} pixels",
            include_in_memory=True,
        )
    except Exception as e:
        logger.error(f"Error scrolling down: {e}")
        return ActionResult(
            is_done=False,
            success=False,
            extracted_content=str(e),
            include_in_memory=True,
        )


@self.registry.action(
"Paste clipboard text",
Expand Down Expand Up @@ -237,6 +277,23 @@ async def send(params: SendAction, context: WechatContext):
logger.error(f"Error pasting text: {e}")
return ActionResult(is_done=False, success=False, extracted_content=str(e), include_in_memory=True)

@self.registry.action(
    "Extract content",
    param_model=ExtractContent,
)
async def extract_content(params: ExtractContent, context: WechatContext):
    """Record caller-supplied content against an extraction requirement.

    No screen reading happens here: the text in ``params.content`` is
    logged and echoed back in the result so it can be kept in memory.

    Args:
        params: ``target`` is the extraction requirement and ``content``
            is the text to record.
        context: Unused by this action.

    Returns:
        An ``ActionResult`` whose ``extracted_content`` embeds both the
        requirement and the content, or a failed result carrying the error
        text.
    """
    # Bound before the ``try`` so the error handler can always format
    # ``target`` without raising UnboundLocalError and masking the real error.
    target = params.target
    content = params.content
    try:
        logger.info(f"Extracted content based on the requirement '{target}': {content}")
        return ActionResult(
            is_done=False,
            success=True,
            extracted_content=f"📋 Extracted content based on the requirement '{target}': {content}",
            include_in_memory=True,
        )
    except Exception as e:
        logger.error(f"Error extracting content based on the requirement '{target}': {e}")
        return ActionResult(
            is_done=False,
            success=False,
            extracted_content=str(e),
            include_in_memory=True,
        )

# @self.registry.action(
# "Modify clipboard text",
# param_model=ModifyClipboardTextAction,
Expand Down
4 changes: 4 additions & 0 deletions macosagent/agents/wechat_agent/controller/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ class CopyAction(BaseModel):
class SendAction(BaseModel):
    """Parameters for the "send" action."""

    # Integer index of the target element.
    # NOTE(review): presumably matched against ``item["id"]`` in the
    # accessibility tree like the other actions; confirm in the controller.
    index: int

class ExtractContent(BaseModel):
    """Parameters for the "extract_content" action."""

    # The extraction requirement, i.e. what should be extracted.
    target: str
    # The source text the requirement applies to.
    content: str

# class ModifyClipboardTextAction(BaseModel):
# requirement_text: str

Expand Down