Skip to content

Commit 801fc4e

Browse files
google-genai-bot authored and copybara-github committed
feat: Add ComputerUse tool
PiperOrigin-RevId: 871087255
1 parent d24e6cc commit 801fc4e

File tree

9 files changed

+1173
-0
lines changed

9 files changed

+1173
-0
lines changed
Lines changed: 99 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,99 @@
1+
/*
2+
* Copyright 2026 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.google.adk.tools.computeruse;
18+
19+
import com.google.adk.tools.Annotations.Schema;
20+
import io.reactivex.rxjava3.core.Completable;
21+
import io.reactivex.rxjava3.core.Single;
22+
import java.time.Duration;
23+
import java.util.List;
24+
25+
/**
26+
* Defines an interface for computer environments.
27+
*
28+
* <p>This interface defines the standard methods for controlling computer environments, including
29+
* web browsers and other interactive systems.
30+
*/
31+
public interface BaseComputer {
32+
33+
/** Returns the screen size of the environment. */
34+
Single<int[]> screenSize();
35+
36+
/** Opens the web browser. */
37+
Single<ComputerState> openWebBrowser();
38+
39+
/** Clicks at a specific x, y coordinate on the webpage. */
40+
Single<ComputerState> clickAt(@Schema(name = "x") int x, @Schema(name = "y") int y);
41+
42+
/** Hovers at a specific x, y coordinate on the webpage. */
43+
Single<ComputerState> hoverAt(@Schema(name = "x") int x, @Schema(name = "y") int y);
44+
45+
/** Types text at a specific x, y coordinate. */
46+
Single<ComputerState> typeTextAt(
47+
@Schema(name = "x") int x,
48+
@Schema(name = "y") int y,
49+
@Schema(name = "text") String text,
50+
@Schema(name = "press_enter", optional = true) Boolean pressEnter,
51+
@Schema(name = "clear_before_typing", optional = true) Boolean clearBeforeTyping);
52+
53+
/** Scrolls the entire webpage in a direction. */
54+
Single<ComputerState> scrollDocument(@Schema(name = "direction") String direction);
55+
56+
/** Scrolls at a specific x, y coordinate by magnitude. */
57+
Single<ComputerState> scrollAt(
58+
@Schema(name = "x") int x,
59+
@Schema(name = "y") int y,
60+
@Schema(name = "direction") String direction,
61+
@Schema(name = "magnitude") int magnitude);
62+
63+
/** Waits for specified duration. */
64+
Single<ComputerState> wait(@Schema(name = "duration") Duration duration);
65+
66+
/** Navigates back. */
67+
Single<ComputerState> goBack();
68+
69+
/** Navigates forward. */
70+
Single<ComputerState> goForward();
71+
72+
/** Jumps to search. */
73+
Single<ComputerState> search();
74+
75+
/** Navigates to URL. */
76+
Single<ComputerState> navigate(@Schema(name = "url") String url);
77+
78+
/** Presses key combination. */
79+
Single<ComputerState> keyCombination(@Schema(name = "keys") List<String> keys);
80+
81+
/** Drag and drop. */
82+
Single<ComputerState> dragAndDrop(
83+
@Schema(name = "x") int x,
84+
@Schema(name = "y") int y,
85+
@Schema(name = "destination_x") int destinationX,
86+
@Schema(name = "destination_y") int destinationY);
87+
88+
/** Returns current state. */
89+
Single<ComputerState> currentState();
90+
91+
/** Initialize the computer. */
92+
Completable initialize();
93+
94+
/** Cleanup resources. */
95+
Completable close();
96+
97+
/** Returns the environment. */
98+
Single<ComputerEnvironment> environment();
99+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/*
2+
* Copyright 2026 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.google.adk.tools.computeruse;
18+
19+
/**
 * Kinds of computer environment.
 *
 * <p>The constant names and their declaration order are part of the public contract and must not
 * be changed.
 */
public enum ComputerEnvironment {
  /** No environment has been specified. */
  ENVIRONMENT_UNSPECIFIED,
  /** A web browser environment. */
  ENVIRONMENT_BROWSER
}
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/*
2+
* Copyright 2026 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.google.adk.tools.computeruse;
18+
19+
import com.fasterxml.jackson.annotation.JsonCreator;
20+
import com.fasterxml.jackson.annotation.JsonProperty;
21+
import com.google.errorprone.annotations.CanIgnoreReturnValue;
22+
import java.util.Arrays;
23+
import java.util.Objects;
24+
import java.util.Optional;
25+
26+
/**
27+
* Represents the current state of the computer environment.
28+
*
29+
* <p>Attributes: screenshot: The screenshot in PNG format as bytes. url: The current URL of the
30+
* webpage being displayed.
31+
*/
32+
public final class ComputerState {
33+
private final byte[] screenshot;
34+
private final Optional<String> url;
35+
36+
@JsonCreator
37+
private ComputerState(
38+
@JsonProperty("screenshot") byte[] screenshot, @JsonProperty("url") Optional<String> url) {
39+
this.screenshot = screenshot.clone();
40+
this.url = url;
41+
}
42+
43+
@JsonProperty("screenshot")
44+
public byte[] screenshot() {
45+
return screenshot.clone();
46+
}
47+
48+
@JsonProperty("url")
49+
public Optional<String> url() {
50+
return url;
51+
}
52+
53+
public static Builder builder() {
54+
return new Builder();
55+
}
56+
57+
/** Builder for {@link ComputerState}. */
58+
public static final class Builder {
59+
private byte[] screenshot;
60+
private Optional<String> url = Optional.empty();
61+
62+
@CanIgnoreReturnValue
63+
public Builder screenshot(byte[] screenshot) {
64+
this.screenshot = screenshot.clone();
65+
return this;
66+
}
67+
68+
@CanIgnoreReturnValue
69+
public Builder url(Optional<String> url) {
70+
this.url = url;
71+
return this;
72+
}
73+
74+
@CanIgnoreReturnValue
75+
public Builder url(String url) {
76+
this.url = Optional.ofNullable(url);
77+
return this;
78+
}
79+
80+
public ComputerState build() {
81+
return new ComputerState(screenshot, url);
82+
}
83+
}
84+
85+
public static ComputerState create(byte[] screenshot, String url) {
86+
return builder().screenshot(screenshot).url(url).build();
87+
}
88+
89+
public static ComputerState create(byte[] screenshot) {
90+
return builder().screenshot(screenshot).build();
91+
}
92+
93+
@Override
94+
public boolean equals(Object o) {
95+
if (this == o) {
96+
return true;
97+
}
98+
if (!(o instanceof ComputerState that)) {
99+
return false;
100+
}
101+
return Objects.deepEquals(screenshot, that.screenshot) && Objects.equals(url, that.url);
102+
}
103+
104+
@Override
105+
public int hashCode() {
106+
return Objects.hash(Arrays.hashCode(screenshot), url);
107+
}
108+
}
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
/*
2+
* Copyright 2026 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.google.adk.tools.computeruse;
18+
19+
import static java.lang.String.format;
20+
21+
import com.google.adk.tools.FunctionTool;
22+
import com.google.adk.tools.ToolContext;
23+
import com.google.common.collect.ImmutableMap;
24+
import io.reactivex.rxjava3.core.Single;
25+
import java.lang.reflect.Method;
26+
import java.util.Base64;
27+
import java.util.HashMap;
28+
import java.util.Map;
29+
import org.slf4j.Logger;
30+
import org.slf4j.LoggerFactory;
31+
32+
/**
33+
* A tool that wraps computer control functions for use with LLMs.
34+
*
35+
* <p>This tool automatically normalizes coordinates from a virtual coordinate space (by default
36+
* 1000x1000) to the actual screen size.
37+
*/
38+
public class ComputerUseTool extends FunctionTool {
39+
40+
private static final Logger logger = LoggerFactory.getLogger(ComputerUseTool.class);
41+
42+
private final int[] screenSize;
43+
private final int[] coordinateSpace;
44+
45+
public ComputerUseTool(Object instance, Method func, int[] screenSize, int[] virtualScreenSize) {
46+
super(instance, func, /* isLongRunning= */ false);
47+
this.screenSize = screenSize;
48+
this.coordinateSpace = virtualScreenSize;
49+
}
50+
51+
private int normalize(Object object, String coordinateName, int index) {
52+
if (!(object instanceof Number number)) {
53+
throw new IllegalArgumentException(format("%s coordinate must be numeric", coordinateName));
54+
}
55+
double coordinate = number.doubleValue();
56+
int normalized = (int) (coordinate / coordinateSpace[index] * screenSize[index]);
57+
// Clamp to screen bounds
58+
int clamped = Math.max(0, Math.min(normalized, screenSize[index] - 1));
59+
logger.atDebug().log(
60+
format(
61+
"%s: %.2f, normalized %s: %d, screen %s size: %d, coordinate-space %s size: %d, "
62+
+ "clamped %s: %d",
63+
coordinateName,
64+
coordinate,
65+
coordinateName,
66+
normalized,
67+
coordinateName,
68+
screenSize[index],
69+
coordinateName,
70+
coordinateSpace[index],
71+
coordinateName,
72+
clamped));
73+
return clamped;
74+
}
75+
76+
private int normalizeX(Object xObj) {
77+
return normalize(xObj, "x", 0);
78+
}
79+
80+
private int normalizeY(Object yObj) {
81+
return normalize(yObj, "y", 1);
82+
}
83+
84+
@Override
85+
public Single<Map<String, Object>> runAsync(Map<String, Object> args, ToolContext toolContext) {
86+
Map<String, Object> normalizedArgs = new HashMap<>(args);
87+
88+
if (args.containsKey("x")) {
89+
normalizedArgs.put("x", normalizeX(args.get("x")));
90+
}
91+
if (args.containsKey("y")) {
92+
normalizedArgs.put("y", normalizeY(args.get("y")));
93+
}
94+
if (args.containsKey("destination_x")) {
95+
normalizedArgs.put("destination_x", normalizeX(args.get("destination_x")));
96+
}
97+
if (args.containsKey("destination_y")) {
98+
normalizedArgs.put("destination_y", normalizeY(args.get("destination_y")));
99+
}
100+
101+
return super.runAsync(normalizedArgs, toolContext)
102+
.map(
103+
result -> {
104+
// If the underlying tool method returned a structure containing a "screenshot" field
105+
// (e.g., a ComputerState object), FunctionTool.runAsync will have converted it to a
106+
// Map. This post-processing step transforms the byte array "screenshot" field into
107+
// an "image" map with a mimetype and Base64 encoded data, as expected by some
108+
// consuming systems.
109+
if (result.containsKey("screenshot") && result.get("screenshot") instanceof byte[]) {
110+
byte[] screenshot = (byte[]) result.get("screenshot");
111+
ImmutableMap<String, Object> imageMap =
112+
ImmutableMap.of(
113+
"mimetype",
114+
"image/png",
115+
"data",
116+
Base64.getEncoder().encodeToString(screenshot));
117+
Map<String, Object> finalResult = new HashMap<>(result);
118+
finalResult.remove("screenshot");
119+
finalResult.put("image", imageMap);
120+
return finalResult;
121+
}
122+
return result;
123+
});
124+
}
125+
}

0 commit comments

Comments
 (0)