diff --git a/.gitignore b/.gitignore index 530f7155..70d4bca7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,6 @@ samples/CameraAccessAndroid/local.properties samples/CameraAccessAndroid/.gradle/ samples/CameraAccessAndroid/build/ samples/CameraAccessAndroid/app/build/ + +# Server logs (now stored in ~/.openclaw/visionclaw-logs/ for agent access) +samples/CameraAccess/server/logs/ diff --git a/samples/CameraAccess/CameraAccess.xcodeproj/project.pbxproj b/samples/CameraAccess/CameraAccess.xcodeproj/project.pbxproj index 1e7dbda4..cbce8864 100644 --- a/samples/CameraAccess/CameraAccess.xcodeproj/project.pbxproj +++ b/samples/CameraAccess/CameraAccess.xcodeproj/project.pbxproj @@ -12,7 +12,6 @@ 8FD96B7F2E6F0A9800F56AB1 /* CameraAccessApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8FD96B792E6F0A9800F56AB1 /* CameraAccessApp.swift */; }; 8FD96B812E6F0A9800F56AB1 /* HomeScreenView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8FD96B722E6F0A9800F56AB1 /* HomeScreenView.swift */; }; 8FD96B872E6F0A9800F56AB1 /* StreamSessionViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8FD96B6F2E6F0A9800F56AB1 /* StreamSessionViewModel.swift */; }; - 9DD6CC002F4A000000ED7098 /* VideoDecoder.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9DD6CBFF2F4A000000ED7098 /* VideoDecoder.swift */; }; 8FD96B882E6F0A9800F56AB1 /* StreamSessionView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8FD96B752E6F0A9800F56AB1 /* StreamSessionView.swift */; }; 8FD96B8A2E6F0A9800F56AB1 /* PhotoPreviewView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8FD96B742E6F0A9800F56AB1 /* PhotoPreviewView.swift */; }; 8FD96B8D2E6F0A9800F56AB1 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8FD96B772E6F0A9800F56AB1 /* Assets.xcassets */; }; @@ -27,6 +26,8 @@ 8FFD60542E849D0D0035E446 /* RegistrationView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8FFD60532E849D0D0035E446 /* RegistrationView.swift */; }; 8FFD60602E84A2F70035E446 /* 
MainAppView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8FFD605F2E84A2F70035E446 /* MainAppView.swift */; }; 8FFD60612E84A2F70035E446 /* DebugMenuView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8FFD605E2E84A2F70035E446 /* DebugMenuView.swift */; }; + 9D8CD52F2F746BF600E5149E /* ChatTranscriptView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9D8CD52D2F746BF600E5149E /* ChatTranscriptView.swift */; }; + 9D8CD5302F746BF600E5149E /* ChatMessage.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9D8CD52C2F746BF600E5149E /* ChatMessage.swift */; }; 9DD6CAAF2F3C426600ED7098 /* Secrets.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9DD6CAAD2F3C426600ED7098 /* Secrets.swift */; }; 9DD6CAFE2F3C62DA00ED7098 /* WebRTC in Frameworks */ = {isa = PBXBuildFile; productRef = 9DD6CAFD2F3C62DA00ED7098 /* WebRTC */; }; 9DD6CB052F3C637D00ED7098 /* WebRTCSessionViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9DD6CB032F3C637D00ED7098 /* WebRTCSessionViewModel.swift */; }; @@ -36,6 +37,7 @@ 9DD6CB092F3C637D00ED7098 /* WebRTCClient.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9DD6CB012F3C637D00ED7098 /* WebRTCClient.swift */; }; 9DD6CB0C2F3C648800ED7098 /* WebRTC in Frameworks */ = {isa = PBXBuildFile; productRef = 9DD6CB0B2F3C648800ED7098 /* WebRTC */; }; 9DD6CB0E2F3C64F400ED7098 /* WebRTCOverlayView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9DD6CB0D2F3C64F400ED7098 /* WebRTCOverlayView.swift */; }; + 9DD6CC002F4A000000ED7098 /* VideoDecoder.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9DD6CBFF2F4A000000ED7098 /* VideoDecoder.swift */; }; 9DD894B22F4047630090B9B9 /* SettingsManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9DD894AF2F4047630090B9B9 /* SettingsManager.swift */; }; 9DD894B32F4047630090B9B9 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9DD894B02F4047630090B9B9 /* SettingsView.swift */; }; 9DD895962F405E0E0090B9B9 /* RTCVideoView.swift in Sources */ = {isa = 
PBXBuildFile; fileRef = 9DD895952F405E0E0090B9B9 /* RTCVideoView.swift */; }; @@ -45,6 +47,7 @@ A1B2C3D42F0A000200000003 /* AudioManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = A1B2C3D42F0A000100000003 /* AudioManager.swift */; }; A1B2C3D42F0A000200000004 /* GeminiSessionViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = A1B2C3D42F0A000100000004 /* GeminiSessionViewModel.swift */; }; A1B2C3D42F0A000200000005 /* GeminiOverlayView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A1B2C3D42F0A000100000005 /* GeminiOverlayView.swift */; }; + A1B2C3D42F0A000200000006 /* RemoteLogger.swift in Sources */ = {isa = PBXBuildFile; fileRef = A1B2C3D42F0A000100000006 /* RemoteLogger.swift */; }; E66D30242E7DA71900470B48 /* MockDeviceKitButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = E66D30232E7DA71900470B48 /* MockDeviceKitButton.swift */; }; E6A188482EB918740097D0E1 /* StreamView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E6A188472EB918740097D0E1 /* StreamView.swift */; }; E6DA451D2E79A63100E3F688 /* MockDeviceCardView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E6DA45182E79A63100E3F688 /* MockDeviceCardView.swift */; }; @@ -81,7 +84,6 @@ 8F2D237F2E856711002D0588 /* DebugMenuViewModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DebugMenuViewModel.swift; sourceTree = ""; }; 8F8F00772E8ACB4500A4BDAF /* WearablesViewModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WearablesViewModel.swift; sourceTree = ""; }; 8FD96B6F2E6F0A9800F56AB1 /* StreamSessionViewModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = StreamSessionViewModel.swift; sourceTree = ""; }; - 9DD6CBFF2F4A000000ED7098 /* VideoDecoder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = VideoDecoder.swift; sourceTree = ""; }; 8FD96B722E6F0A9800F56AB1 /* HomeScreenView.swift */ = {isa = PBXFileReference; lastKnownFileType = 
sourcecode.swift; path = HomeScreenView.swift; sourceTree = ""; }; 8FD96B742E6F0A9800F56AB1 /* PhotoPreviewView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PhotoPreviewView.swift; sourceTree = ""; }; 8FD96B752E6F0A9800F56AB1 /* StreamSessionView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = StreamSessionView.swift; sourceTree = ""; }; @@ -98,6 +100,8 @@ 8FFD60532E849D0D0035E446 /* RegistrationView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RegistrationView.swift; sourceTree = ""; }; 8FFD605E2E84A2F70035E446 /* DebugMenuView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DebugMenuView.swift; sourceTree = ""; }; 8FFD605F2E84A2F70035E446 /* MainAppView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MainAppView.swift; sourceTree = ""; }; + 9D8CD52C2F746BF600E5149E /* ChatMessage.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatMessage.swift; sourceTree = ""; }; + 9D8CD52D2F746BF600E5149E /* ChatTranscriptView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatTranscriptView.swift; sourceTree = ""; }; 9DD6CAAD2F3C426600ED7098 /* Secrets.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Secrets.swift; sourceTree = ""; }; 9DD6CAAE2F3C426600ED7098 /* Secrets.swift.example */ = {isa = PBXFileReference; lastKnownFileType = text; path = Secrets.swift.example; sourceTree = ""; }; 9DD6CAFF2F3C637D00ED7098 /* CustomVideoCapturer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CustomVideoCapturer.swift; sourceTree = ""; }; @@ -106,6 +110,7 @@ 9DD6CB022F3C637D00ED7098 /* WebRTCConfig.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WebRTCConfig.swift; sourceTree = ""; }; 9DD6CB032F3C637D00ED7098 /* WebRTCSessionViewModel.swift */ = {isa = 
PBXFileReference; lastKnownFileType = sourcecode.swift; path = WebRTCSessionViewModel.swift; sourceTree = ""; }; 9DD6CB0D2F3C64F400ED7098 /* WebRTCOverlayView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WebRTCOverlayView.swift; sourceTree = ""; }; + 9DD6CBFF2F4A000000ED7098 /* VideoDecoder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = VideoDecoder.swift; sourceTree = ""; }; 9DD894AF2F4047630090B9B9 /* SettingsManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsManager.swift; sourceTree = ""; }; 9DD894B02F4047630090B9B9 /* SettingsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsView.swift; sourceTree = ""; }; 9DD895942F405E0E0090B9B9 /* PiPVideoView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PiPVideoView.swift; sourceTree = ""; }; @@ -115,6 +120,7 @@ A1B2C3D42F0A000100000003 /* AudioManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AudioManager.swift; sourceTree = ""; }; A1B2C3D42F0A000100000004 /* GeminiSessionViewModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = GeminiSessionViewModel.swift; sourceTree = ""; }; A1B2C3D42F0A000100000005 /* GeminiOverlayView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = GeminiOverlayView.swift; sourceTree = ""; }; + A1B2C3D42F0A000100000006 /* RemoteLogger.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RemoteLogger.swift; sourceTree = ""; }; E66D30232E7DA71900470B48 /* MockDeviceKitButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MockDeviceKitButton.swift; sourceTree = ""; }; E699CC952E8150670052C240 /* CameraAccessTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = CameraAccessTests.xctest; 
sourceTree = BUILT_PRODUCTS_DIR; }; E6A188472EB918740097D0E1 /* StreamView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = StreamView.swift; sourceTree = ""; }; @@ -203,6 +209,7 @@ 8FD96B7D2E6F0A9800F56AB1 /* CameraAccess */ = { isa = PBXGroup; children = ( + 9D8CD52E2F746BF600E5149E /* Chat */, 9DD894B12F4047630090B9B9 /* Settings */, 9DD6CB042F3C637D00ED7098 /* WebRTC */, 9DD6CAAD2F3C426600ED7098 /* Secrets.swift */, @@ -242,6 +249,15 @@ path = MockDeviceKit; sourceTree = ""; }; + 9D8CD52E2F746BF600E5149E /* Chat */ = { + isa = PBXGroup; + children = ( + 9D8CD52C2F746BF600E5149E /* ChatMessage.swift */, + 9D8CD52D2F746BF600E5149E /* ChatTranscriptView.swift */, + ); + path = Chat; + sourceTree = ""; + }; 9DD6CB042F3C637D00ED7098 /* WebRTC */ = { isa = PBXGroup; children = ( @@ -273,6 +289,7 @@ A1B2C3D42F0A000100000001 /* GeminiConfig.swift */, A1B2C3D42F0A000100000002 /* GeminiLiveService.swift */, A1B2C3D42F0A000100000004 /* GeminiSessionViewModel.swift */, + A1B2C3D42F0A000100000006 /* RemoteLogger.swift */, ); path = Gemini; sourceTree = ""; @@ -434,9 +451,12 @@ A1B2C3D42F0A000200000002 /* GeminiLiveService.swift in Sources */, A1B2C3D42F0A000200000003 /* AudioManager.swift in Sources */, A1B2C3D42F0A000200000004 /* GeminiSessionViewModel.swift in Sources */, + 9D8CD52F2F746BF600E5149E /* ChatTranscriptView.swift in Sources */, + 9D8CD5302F746BF600E5149E /* ChatMessage.swift in Sources */, 9DD894B22F4047630090B9B9 /* SettingsManager.swift in Sources */, 9DD894B32F4047630090B9B9 /* SettingsView.swift in Sources */, A1B2C3D42F0A000200000005 /* GeminiOverlayView.swift in Sources */, + A1B2C3D42F0A000200000006 /* RemoteLogger.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/samples/CameraAccess/CameraAccess/Chat/ChatHistoryStore.swift b/samples/CameraAccess/CameraAccess/Chat/ChatHistoryStore.swift new file mode 100644 index 00000000..7477c2b9 --- /dev/null +++ 
b/samples/CameraAccess/CameraAccess/Chat/ChatHistoryStore.swift
@@ -0,0 +1,113 @@
+import Foundation
+
+/// Persists the chat transcript as a JSON array in the app's Documents directory.
+///
+/// Persistence is best-effort: `save` logs and gives up on failure, and `load`
+/// returns an empty array when the file is missing or cannot be parsed.
+enum ChatHistoryStore {
+  private static let filename = "chat_history.json"
+  /// Upper bound on persisted messages; `save` keeps only the trailing ones.
+  private static let maxMessages = 500
+
+  /// Location of the history file inside the Documents directory.
+  private static var fileURL: URL {
+    let docs = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
+    return docs.appendingPathComponent(filename)
+  }
+
+  /// Writes the trailing `maxMessages` elements of `messages` to disk, replacing the previous file.
+  ///
+  /// - Parameter messages: The transcript to persist.
+  static func save(_ messages: [ChatMessage]) {
+    let toSave = Array(messages.suffix(maxMessages))
+    let records: [[String: Any]] = toSave.map { msg in
+      [
+        "id": msg.id,
+        "role": serializeRole(msg.role),
+        "text": msg.text,
+        "timestamp": msg.timestamp.timeIntervalSince1970,
+        "status": serializeStatus(msg.status)
+      ]
+    }
+    do {
+      let data = try JSONSerialization.data(withJSONObject: records)
+      // Write atomically: `load` discards the whole history when the file does not
+      // parse, so an interrupted in-place write would wipe the saved transcript.
+      try data.write(to: fileURL, options: .atomic)
+    } catch {
+      NSLog("[ChatHistory] Failed to save history: %@", error.localizedDescription)
+    }
+  }
+
+  /// Reads the persisted transcript.
+  ///
+  /// Records without a string `id`, string `role`, or numeric `timestamp` are skipped;
+  /// a missing `text` becomes "" and a missing `status` becomes "complete".
+  ///
+  /// - Returns: The stored messages, or an empty array when nothing usable is on disk.
+  static func load() -> [ChatMessage] {
+    guard FileManager.default.fileExists(atPath: fileURL.path),
+          let data = try? Data(contentsOf: fileURL),
+          let records = try? JSONSerialization.jsonObject(with: data) as? [[String: Any]] else {
+      return []
+    }
+    return records.compactMap { obj in
+      guard let id = obj["id"] as? String,
+            let roleStr = obj["role"] as? String,
+            let timestamp = obj["timestamp"] as? TimeInterval else { return nil }
+      let text = obj["text"] as? String ?? ""
+      let statusStr = obj["status"] as? String ?? "complete"
+      return ChatMessage(
+        id: id,
+        role: deserializeRole(roleStr),
+        text: text,
+        timestamp: Date(timeIntervalSince1970: timestamp),
+        status: deserializeStatus(statusStr)
+      )
+    }
+  }
+
+  // MARK: - Serialization
+
+  /// Encodes a role as a string; tool calls become "tool:<name>".
+  private static func serializeRole(_ role: ChatMessageRole) -> String {
+    switch role {
+    case .user: return "user"
+    case .assistant: return "assistant"
+    case .toolCall(let name): return "tool:\(name)"
+    case .sessionDivider: return "divider"
+    }
+  }
+
+  /// Decodes a role string; anything unrecognized falls back to `.assistant`.
+  private static func deserializeRole(_ s: String) -> ChatMessageRole {
+    switch s {
+    case "user": return .user
+    case "assistant": return .assistant
+    case "divider": return .sessionDivider
+    default:
+      if s.hasPrefix("tool:") { return .toolCall(String(s.dropFirst(5))) }
+      return .assistant
+    }
+  }
+
+  /// Encodes a status as a string; errors become "error:<message>".
+  private static func serializeStatus(_ status: ChatMessageStatus) -> String {
+    switch status {
+    case .streaming: return "streaming"
+    case .complete: return "complete"
+    case .error(let msg): return "error:\(msg)"
+    }
+  }
+
+  /// Decodes a status string; "streaming" and unrecognized values become `.complete`.
+  private static func deserializeStatus(_ s: String) -> ChatMessageStatus {
+    switch s {
+    case "complete": return .complete
+    case "streaming": return .complete // treat stale streaming as complete
+    default:
+      if s.hasPrefix("error:") { return .error(String(s.dropFirst(6))) }
+      return .complete
+    }
+  }
+}
diff --git a/samples/CameraAccess/CameraAccess/Chat/ChatMessage.swift b/samples/CameraAccess/CameraAccess/Chat/ChatMessage.swift
new file mode 100644
index 00000000..c1109d45
--- /dev/null
+++ b/samples/CameraAccess/CameraAccess/Chat/ChatMessage.swift
@@ -0,0 +1,38 @@
+import Foundation
+
+struct ChatMessage: Identifiable, Equatable {
+  let id: String
+  let role: ChatMessageRole
+  var text: String
+  let timestamp: Date
+  var status: ChatMessageStatus
+
+  init(role: ChatMessageRole, text: String, status: ChatMessageStatus = .complete) {
+    self.id = UUID().uuidString
+    self.role = role
+    self.text = text
+    self.timestamp = Date()
+    self.status = status
+  }
+
+  init(id: 
String, role: ChatMessageRole, text: String, timestamp: Date, status: ChatMessageStatus = .complete) {
+    self.id = id
+    self.role = role
+    self.text = text
+    self.timestamp = timestamp
+    self.status = status
+  }
+}
+
+enum ChatMessageRole: Equatable {
+  case user
+  case assistant
+  case toolCall(String) // tool name
+  case sessionDivider // separator between sessions
+}
+
+enum ChatMessageStatus: Equatable {
+  case streaming
+  case complete
+  case error(String)
+}
diff --git a/samples/CameraAccess/CameraAccess/Chat/ChatTranscriptView.swift b/samples/CameraAccess/CameraAccess/Chat/ChatTranscriptView.swift
new file mode 100644
index 00000000..b749053c
--- /dev/null
+++ b/samples/CameraAccess/CameraAccess/Chat/ChatTranscriptView.swift
@@ -0,0 +1,211 @@
+import SwiftUI
+
+/// Scrollable transcript of `geminiVM.messages` that keeps the newest message in view.
+struct ChatTranscriptView: View {
+  @ObservedObject var geminiVM: GeminiSessionViewModel
+
+  var body: some View {
+    ScrollViewReader { proxy in
+      ScrollView {
+        LazyVStack(spacing: 4) {
+          ForEach(Array(geminiVM.messages.enumerated()), id: \.element.id) { index, message in
+            let showTime = shouldShowTimestamp(at: index, in: geminiVM.messages)
+            MessageBubbleView(message: message, showTimestamp: showTime)
+              .id(message.id)
+          }
+        }
+        .padding(.vertical, 12)
+      }
+      .onChange(of: geminiVM.messages.count) { _, _ in
+        // Unwrap the ID so `scrollTo` receives a `String`, the same type as the
+        // `.id(message.id)` tags, instead of a `String?`; nothing to do when empty.
+        guard let lastID = geminiVM.messages.last?.id else { return }
+        withAnimation(.easeOut(duration: 0.2)) {
+          proxy.scrollTo(lastID, anchor: .bottom)
+        }
+      }
+      .onChange(of: geminiVM.messages.last?.text) { _, _ in
+        // The newest message's text changed: follow it without animation.
+        guard let lastID = geminiVM.messages.last?.id else { return }
+        proxy.scrollTo(lastID, anchor: .bottom)
+      }
+    }
+  }
+}
+
+/// Decides whether the message at `index` shows its time label.
+///
+/// Dividers never do; the first message and the first message after a divider
+/// always do; any other message does only after a gap of more than 120 seconds.
+private func shouldShowTimestamp(at index: Int, in messages: [ChatMessage]) -> Bool {
+  let message = messages[index]
+  if message.role == .sessionDivider { return false }
+  if index == 0 { return true }
+  let prev = messages[index - 1]
+  if prev.role == .sessionDivider { return true }
+  // Show timestamp if more than 2 minutes have passed since the previous message
+  return message.timestamp.timeIntervalSince(prev.timestamp) > 120
+}
+
+/// Renders one transcript entry according to its role: user bubble, assistant
+/// text, tool-call pill, or session divider.
+struct MessageBubbleView: View {
+  let message: ChatMessage
+  let showTimestamp: Bool
+
+  // Formatters are shared across rows: constructing a `DateFormatter` is
+  // expensive and these labels are recomputed whenever a row re-renders.
+  private static let timeFormatter: DateFormatter = {
+    let formatter = DateFormatter()
+    formatter.locale = Locale.autoupdatingCurrent
+    formatter.timeStyle = .short
+    return formatter
+  }()
+
+  private static let dateTimeFormatter: DateFormatter = {
+    let formatter = DateFormatter()
+    formatter.locale = Locale.autoupdatingCurrent
+    formatter.dateStyle = .medium
+    formatter.timeStyle = .short
+    return formatter
+  }()
+
+  init(message: ChatMessage, showTimestamp: Bool = false) {
+    self.message = message
+    self.showTimestamp = showTimestamp
+  }
+
+  /// Short time of day shown under a bubble.
+  private var timeString: String {
+    Self.timeFormatter.string(from: message.timestamp)
+  }
+
+  var body: some View {
+    switch message.role {
+    case .user:
+      userBubble
+    case .assistant:
+      assistantBubble
+    case .toolCall(let name):
+      toolCallPill(name: name)
+    case .sessionDivider:
+      sessionDivider
+    }
+  }
+
+  /// Thin horizontal rule with the divider's formatted timestamp centered on it.
+  private var sessionDivider: some View {
+    HStack {
+      Rectangle()
+        .fill(Color.white.opacity(0.2))
+        .frame(height: 0.5)
+      Text(formattedDate)
+        .font(.system(size: 11))
+        .foregroundColor(.white.opacity(0.4))
+        .fixedSize()
+      Rectangle()
+        .fill(Color.white.opacity(0.2))
+        .frame(height: 0.5)
+    }
+    .padding(.horizontal, 24)
+    .padding(.vertical, 12)
+  }
+
+  /// Divider label: "Today <time>" for today's timestamps, otherwise a
+  /// medium-style date with a short time.
+  private var formattedDate: String {
+    if Calendar.current.isDateInToday(message.timestamp) {
+      return "Today \(Self.timeFormatter.string(from: message.timestamp))"
+    }
+    return Self.dateTimeFormatter.string(from: message.timestamp)
+  }
+
+  /// Right-aligned blue bubble for user text.
+  private var userBubble: some View {
+    HStack {
+      Spacer(minLength: 60)
+      VStack(alignment: .trailing, spacing: 2) {
+        Text(message.text)
+          .font(.system(size: 15))
+          .foregroundColor(.white)
+          .padding(.horizontal, 14)
+          .padding(.vertical, 10)
+          .background(Color.blue)
+          .cornerRadius(18)
+        if showTimestamp {
+          Text(timeString)
+            .font(.system(size: 10))
+            .foregroundColor(.white.opacity(0.3))
+        }
+      }
+    }
+    .padding(.horizontal, 16)
+    .padding(.vertical, 2)
+  }
+
+  /// Left-aligned plain text for assistant output, with a blinking cursor while streaming.
+  private var assistantBubble: some View {
+    HStack {
+      VStack(alignment: .leading, spacing: 2) {
+        VStack(alignment: .leading, spacing: 0) {
+          Text(message.text)
+            .font(.system(size: 15))
+            .foregroundColor(.white.opacity(0.9))
+          if message.status == .streaming {
+            TypingCursor()
+              .padding(.top, 2)
+          }
+        }
+        if showTimestamp {
+          Text(timeString)
+            .font(.system(size: 10))
+            .foregroundColor(.white.opacity(0.3))
+        }
+      }
+      Spacer(minLength: 60)
+    }
+    .padding(.horizontal, 16)
+    .padding(.vertical, 2)
+  }
+
+  /// Centered pill naming a tool call: spinner while streaming, green check otherwise.
+  private func toolCallPill(name: String) -> some View {
+    HStack(spacing: 6) {
+      if message.status == .streaming {
+        ProgressView()
+          .scaleEffect(0.6)
+          .tint(.white)
+      } else {
+        Image(systemName: "checkmark.circle.fill")
+          .foregroundColor(.green)
+          .font(.system(size: 12))
+      }
+      Text(name)
+        .font(.system(size: 12, weight: .medium))
+        .foregroundColor(.white.opacity(0.8))
+    }
+    .padding(.horizontal, 12)
+    .padding(.vertical, 6)
+    .background(Color.white.opacity(0.15))
+    .cornerRadius(12)
+    .frame(maxWidth: .infinity)
+    .padding(.vertical, 4)
+  }
+}
+
+/// Thin vertical bar that starts a repeating fade in/out animation when it appears.
+struct TypingCursor: View {
+  @State private var visible = true
+
+  var body: some View {
+    RoundedRectangle(cornerRadius: 1)
+      .fill(Color.white.opacity(0.6))
+      .frame(width: 2, height: 14)
+      .opacity(visible ? 1 : 0)
+      .onAppear {
+        withAnimation(.easeInOut(duration: 0.5).repeatForever(autoreverses: true)) {
+          visible = false
+        }
+      }
+  }
+}
diff --git a/samples/CameraAccess/CameraAccess/Gallery/CapturedPhoto.swift b/samples/CameraAccess/CameraAccess/Gallery/CapturedPhoto.swift
new file mode 100644
index 00000000..148002fe
--- /dev/null
+++ b/samples/CameraAccess/CameraAccess/Gallery/CapturedPhoto.swift
@@ -0,0 +1,19 @@
+import Foundation
+
+struct CapturedPhoto: Identifiable, Codable {
+  let id: String
+  let filename: String
+  let timestamp: Date
+  var description: String?
+
+  var fileURL: URL {
+    Self.capturesDirectory.appendingPathComponent(filename)
+  }
+
+  static var capturesDirectory: URL {
+    let docs = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
+    let dir = docs.appendingPathComponent("Captures", isDirectory: true)
+    try? 
FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true)
+    return dir
+  }
+}
diff --git a/samples/CameraAccess/CameraAccess/Gallery/GalleryDetailView.swift b/samples/CameraAccess/CameraAccess/Gallery/GalleryDetailView.swift
new file mode 100644
index 00000000..02b88973
--- /dev/null
+++ b/samples/CameraAccess/CameraAccess/Gallery/GalleryDetailView.swift
@@ -0,0 +1,99 @@
+import SwiftUI
+
+/// Detail screen for one captured photo: the image, its capture date and
+/// description, and Share / Delete actions.
+struct GalleryDetailView: View {
+  let photo: CapturedPhoto
+  @ObservedObject private var store = PhotoCaptureStore.shared
+  @Environment(\.dismiss) private var dismiss
+  @State private var showShareSheet = false
+  @State private var showDeleteConfirmation = false
+  /// Image read from disk once on appear; `nil` until then or when the file is unreadable.
+  @State private var loadedImage: UIImage?
+  /// Separates "not loaded yet" from "loaded and missing" so the placeholder is not shown early.
+  @State private var didAttemptLoad = false
+
+  private var formattedDate: String {
+    let formatter = DateFormatter()
+    formatter.dateStyle = .medium
+    formatter.timeStyle = .short
+    return formatter.string(from: photo.timestamp)
+  }
+
+  var body: some View {
+    VStack(spacing: 0) {
+      // Image
+      if let image = loadedImage {
+        Image(uiImage: image)
+          .resizable()
+          .aspectRatio(contentMode: .fit)
+          .frame(maxWidth: .infinity)
+
+        // Metadata
+        VStack(alignment: .leading, spacing: 8) {
+          Text(formattedDate)
+            .font(.subheadline)
+            .foregroundColor(.secondary)
+
+          if let description = photo.description, !description.isEmpty {
+            Text(description)
+              .font(.body)
+              .foregroundColor(.primary)
+          }
+        }
+        .frame(maxWidth: .infinity, alignment: .leading)
+        .padding()
+
+        Spacer()
+
+        // Actions
+        HStack(spacing: 40) {
+          Button(action: { showShareSheet = true }) {
+            VStack(spacing: 4) {
+              Image(systemName: "square.and.arrow.up")
+                .font(.title2)
+              Text("Share")
+                .font(.caption)
+            }
+          }
+
+          Button(role: .destructive, action: { showDeleteConfirmation = true }) {
+            VStack(spacing: 4) {
+              Image(systemName: "trash")
+                .font(.title2)
+              Text("Delete")
+                .font(.caption)
+            }
+          }
+        }
+        .padding(.bottom, 30)
+
+        .sheet(isPresented: $showShareSheet) {
+          ShareSheet(photo: image)
+        }
+      } else if didAttemptLoad {
+        Text("Photo not found")
+          .foregroundColor(.secondary)
+          .padding()
+        Spacer()
+      }
+    }
+    .navigationTitle("Photo")
+    .navigationBarTitleDisplayMode(.inline)
+    .confirmationDialog("Delete this photo?", isPresented: $showDeleteConfirmation) {
+      Button("Delete", role: .destructive) {
+        store.deletePhoto(photo)
+        dismiss()
+      }
+      Button("Cancel", role: .cancel) {}
+    }
+    .onAppear {
+      // Read the file once instead of inside `body`: `body` runs again on every state
+      // change and `store` update, which would decode the JPEG each time and, after
+      // `deletePhoto` removes the file, show "Photo not found" during dismissal.
+      guard !didAttemptLoad else { return }
+      loadedImage = store.imageForPhoto(photo)
+      didAttemptLoad = true
+    }
+  }
+}
diff --git a/samples/CameraAccess/CameraAccess/Gallery/GalleryView.swift b/samples/CameraAccess/CameraAccess/Gallery/GalleryView.swift
new file mode 100644
index 00000000..b8b676ff
--- /dev/null
+++ b/samples/CameraAccess/CameraAccess/Gallery/GalleryView.swift
@@ -0,0 +1,72 @@
+import SwiftUI
+
+/// Three-column grid of `PhotoCaptureStore.shared.photos`; tapping a cell opens `GalleryDetailView` in a sheet.
+struct GalleryView: View {
+  @ObservedObject private var store = PhotoCaptureStore.shared
+  @State private var selectedPhoto: CapturedPhoto?
+
+  private let columns = [
+    GridItem(.flexible(), spacing: 2),
+    GridItem(.flexible(), spacing: 2),
+    GridItem(.flexible(), spacing: 2),
+  ]
+
+  var body: some View {
+    Group {
+      if store.photos.isEmpty {
+        VStack(spacing: 12) {
+          Image(systemName: "photo.on.rectangle.angled")
+            .font(.system(size: 48))
+            .foregroundColor(.secondary)
+          Text("No captured photos yet")
+            .font(.headline)
+            .foregroundColor(.secondary)
+          Text("Ask the AI to take a photo while using the glasses")
+            .font(.subheadline)
+            .foregroundColor(.secondary.opacity(0.7))
+            .multilineTextAlignment(.center)
+        }
+        .padding()
+      } else {
+        ScrollView {
+          LazyVGrid(columns: columns, spacing: 2) {
+            ForEach(store.photos) { photo in
+              GalleryThumbnail(photo: photo)
+                .onTapGesture {
+                  selectedPhoto = photo
+                }
+            }
+          }
+        }
+      }
+    }
+    .navigationTitle("Gallery")
+    .navigationBarTitleDisplayMode(.inline)
+    .sheet(item: $selectedPhoto) { photo in
+      NavigationStack {
+        GalleryDetailView(photo: photo)
+      }
+    }
+  }
+}
+
+private struct GalleryThumbnail: View {
+  let photo: CapturedPhoto
+  @ObservedObject private var store = PhotoCaptureStore.shared
+
+  var body: some View {
+    GeometryReader { geo in
+      if let image = store.imageForPhoto(photo) {
+        Image(uiImage: image)
+          
.resizable()
+          .aspectRatio(contentMode: .fill)
+          .frame(width: geo.size.width, height: geo.size.width)
+          .clipped()
+      } else {
+        Color.gray.opacity(0.3)
+          .frame(width: geo.size.width, height: geo.size.width)
+      }
+    }
+    .aspectRatio(1, contentMode: .fit)
+  }
+}
diff --git a/samples/CameraAccess/CameraAccess/Gallery/PhotoCaptureStore.swift b/samples/CameraAccess/CameraAccess/Gallery/PhotoCaptureStore.swift
new file mode 100644
index 00000000..0ca39df8
--- /dev/null
+++ b/samples/CameraAccess/CameraAccess/Gallery/PhotoCaptureStore.swift
@@ -0,0 +1,133 @@
+import Foundation
+import UIKit
+
+/// Stores captured frames as JPEG files in `CapturedPhoto.capturesDirectory` and tracks
+/// them, newest first, in a `manifest.json` kept in the same directory.
+@MainActor
+class PhotoCaptureStore: ObservableObject {
+  static let shared = PhotoCaptureStore()
+
+  @Published var photos: [CapturedPhoto] = []
+
+  private var manifestURL: URL {
+    CapturedPhoto.capturesDirectory.appendingPathComponent("manifest.json")
+  }
+
+  /// Formats the timestamp part of capture filenames.
+  ///
+  /// Fixed-format patterns need the `en_US_POSIX` locale; with the user's locale the
+  /// output can change with the device calendar or the 12/24-hour setting.
+  private static let filenameFormatter: DateFormatter = {
+    let formatter = DateFormatter()
+    formatter.locale = Locale(identifier: "en_US_POSIX")
+    formatter.dateFormat = "yyyy-MM-dd_HH-mm-ss"
+    return formatter
+  }()
+
+  private init() {
+    loadManifest()
+  }
+
+  // MARK: - Public
+
+  /// Encodes `image` as JPEG, writes it to the captures directory, and records it in the manifest.
+  ///
+  /// - Parameters:
+  ///   - image: The frame to persist.
+  ///   - description: Optional caption stored with the photo.
+  /// - Returns: The new photo record, or `nil` when encoding or writing the file fails.
+  @discardableResult
+  func saveFrame(_ image: UIImage, description: String?) -> CapturedPhoto? {
+    // One timestamp for both the filename and the record so they cannot disagree.
+    let capturedAt = Date()
+
+    guard let data = image.jpegData(compressionQuality: 0.9) else {
+      NSLog("[PhotoCapture] Failed to encode JPEG")
+      return nil
+    }
+
+    let fileURL = uniqueFileURL(for: capturedAt)
+    let filename = fileURL.lastPathComponent
+    do {
+      try data.write(to: fileURL, options: .atomic)
+    } catch {
+      NSLog("[PhotoCapture] Failed to write file: %@", error.localizedDescription)
+      return nil
+    }
+
+    let photo = CapturedPhoto(
+      id: UUID().uuidString,
+      filename: filename,
+      timestamp: capturedAt,
+      description: description
+    )
+
+    photos.insert(photo, at: 0)
+    saveManifest()
+
+    NSLog("[PhotoCapture] Saved: %@ (%ld bytes)", filename, data.count)
+    return photo
+  }
+
+  /// Removes the photo's file (best-effort) and its manifest entry.
+  func deletePhoto(_ photo: CapturedPhoto) {
+    try? FileManager.default.removeItem(at: photo.fileURL)
+    photos.removeAll { $0.id == photo.id }
+    saveManifest()
+    NSLog("[PhotoCapture] Deleted: %@", photo.filename)
+  }
+
+  /// Loads the photo's image from disk; `nil` when the file is missing or unreadable.
+  func imageForPhoto(_ photo: CapturedPhoto) -> UIImage? {
+    UIImage(contentsOfFile: photo.fileURL.path)
+  }
+
+  /// Returns a file URL in the captures directory that no existing file occupies.
+  ///
+  /// Filenames have one-second resolution, so a second capture within the same second
+  /// would otherwise overwrite the first file while both manifest entries point at it.
+  private func uniqueFileURL(for date: Date) -> URL {
+    let directory = CapturedPhoto.capturesDirectory
+    let stem = "capture_\(Self.filenameFormatter.string(from: date))"
+    var candidate = directory.appendingPathComponent("\(stem).jpg")
+    var suffix = 2
+    while FileManager.default.fileExists(atPath: candidate.path) {
+      candidate = directory.appendingPathComponent("\(stem)_\(suffix).jpg")
+      suffix += 1
+    }
+    return candidate
+  }
+
+  // MARK: - Manifest
+
+  /// Populates `photos` from the manifest, dropping entries whose image file is gone.
+  private func loadManifest() {
+    guard FileManager.default.fileExists(atPath: manifestURL.path) else { return }
+    do {
+      let data = try Data(contentsOf: manifestURL)
+      let decoder = JSONDecoder()
+      decoder.dateDecodingStrategy = .iso8601
+      var loaded = try decoder.decode([CapturedPhoto].self, from: data)
+      // Filter out photos whose files no longer exist
+      loaded = loaded.filter { FileManager.default.fileExists(atPath: $0.fileURL.path) }
+      photos = loaded
+      NSLog("[PhotoCapture] Loaded %ld photos from manifest", photos.count)
+    } catch {
+      NSLog("[PhotoCapture] Failed to load manifest: %@", error.localizedDescription)
+    }
+  }
+
+  /// Writes `photos` to the manifest; failures are logged, not thrown.
+  private func saveManifest() {
+    do {
+      let encoder = JSONEncoder()
+      encoder.dateEncodingStrategy = .iso8601
+      encoder.outputFormatting = .prettyPrinted
+      let data = try encoder.encode(photos)
+      // Atomic so an interrupted write cannot leave a truncated manifest.
+      try data.write(to: manifestURL, options: .atomic)
+    } catch {
+      NSLog("[PhotoCapture] Failed to save manifest: %@", error.localizedDescription)
+    }
+  }
+}
diff --git a/samples/CameraAccess/CameraAccess/Gemini/GeminiConfig.swift b/samples/CameraAccess/CameraAccess/Gemini/GeminiConfig.swift index 5c124f66..e1e82927 100644 --- a/samples/CameraAccess/CameraAccess/Gemini/GeminiConfig.swift +++ b/samples/CameraAccess/CameraAccess/Gemini/GeminiConfig.swift @@ -19,7 +19,18 @@ enum GeminiConfig { CRITICAL: You have NO memory, NO storage, and NO ability to take actions on your own. You cannot remember things, keep lists, set reminders, search the web, send messages, or do anything persistent. You are ONLY a voice interface. - You have exactly ONE tool: execute. 
This connects you to a powerful personal assistant that can do anything -- send messages, search the web, manage lists, set reminders, create notes, research topics, control smart home devices, interact with apps, and much more. + You have two tools: execute and capture_photo. + + The capture_photo tool saves the current camera frame as a photo to the device gallery. Use it when the user asks to take a photo, capture what they see, save a picture, or snap a photo. You can include an optional description of what is in the photo. + + When calling execute, you MUST set include_image=true whenever: + - The user asks to send, share, or forward a photo/image to anyone + - The task involves editing, processing, or analyzing an image + - The user says "send this to..." or "show this to..." referring to what they see + - The task requires the assistant to see the current camera view (e.g. identifying a product, reading text from a sign) + Only omit include_image (or set it to false) for purely text-based tasks like sending a text message, searching, or setting a reminder. + + The execute tool connects you to a powerful personal assistant that can do anything -- send messages, search the web, manage lists, set reminders, create notes, research topics, control smart home devices, interact with apps, and much more. ALWAYS use execute when the user asks you to: - Send a message to someone (any platform: WhatsApp, Telegram, iMessage, Slack, etc.) diff --git a/samples/CameraAccess/CameraAccess/Gemini/GeminiSessionViewModel.swift b/samples/CameraAccess/CameraAccess/Gemini/GeminiSessionViewModel.swift index e7d9d902..b5b9d1e4 100644 --- a/samples/CameraAccess/CameraAccess/Gemini/GeminiSessionViewModel.swift +++ b/samples/CameraAccess/CameraAccess/Gemini/GeminiSessionViewModel.swift @@ -9,6 +9,7 @@ class GeminiSessionViewModel: ObservableObject { @Published var errorMessage: String? 
@Published var userTranscript: String = "" @Published var aiTranscript: String = "" + @Published var messages: [ChatMessage] = ChatHistoryStore.load() @Published var toolCallStatus: ToolCallStatus = .idle @Published var openClawConnectionState: OpenClawConnectionState = .notConfigured private let geminiService = GeminiLiveService() @@ -17,8 +18,17 @@ class GeminiSessionViewModel: ObservableObject { private let audioManager = AudioManager() private let eventClient = OpenClawEventClient() private var lastVideoFrameTime: Date = .distantPast + private var latestVideoFrame: UIImage? + private let photoCaptureStore = PhotoCaptureStore.shared + @Published var lastCapturedPhoto: CapturedPhoto? private var stateObservation: Task? + // Chat message tracking + private var activeUserBubbleId: String? + private var activeAIBubbleId: String? + private var lastUserText: String = "" + private var lastAIText: String = "" + var streamingMode: StreamingMode = .glasses func startSession() async { @@ -30,6 +40,12 @@ class GeminiSessionViewModel: ObservableObject { } isGeminiActive = true + RemoteLogger.shared.log("session:start") + + // Insert session divider if there are previous messages + if !messages.isEmpty { + messages.append(ChatMessage(role: .sessionDivider, text: "")) + } // Wire audio callbacks audioManager.onAudioCaptured = { [weak self] data in @@ -54,8 +70,16 @@ class GeminiSessionViewModel: ObservableObject { geminiService.onTurnComplete = { [weak self] in guard let self else { return } Task { @MainActor in - // Clear user transcript when AI finishes responding + // Log finalized transcripts before clearing + if !self.lastUserText.isEmpty { + RemoteLogger.shared.log("voice:user", data: ["text": self.lastUserText]) + } + if !self.lastAIText.isEmpty { + RemoteLogger.shared.log("voice:ai", data: ["text": self.lastAIText]) + } + self.finalizeCurrentBubbles() self.userTranscript = "" + ChatHistoryStore.save(self.messages) } } @@ -64,6 +88,7 @@ class GeminiSessionViewModel: 
ObservableObject { Task { @MainActor in self.userTranscript += text self.aiTranscript = "" + self.updateUserBubble(self.userTranscript) } } @@ -71,6 +96,7 @@ class GeminiSessionViewModel: ObservableObject { guard let self else { return } Task { @MainActor in self.aiTranscript += text + self.updateAIBubble(self.aiTranscript) } } @@ -87,16 +113,66 @@ class GeminiSessionViewModel: ObservableObject { // Check OpenClaw connectivity and start fresh session await openClawBridge.checkConnection() openClawBridge.resetSession() + openClawBridge.eventClient = eventClient // Wire tool call handling toolCallRouter = ToolCallRouter(bridge: openClawBridge) + // Local capture_photo handler + toolCallRouter?.onCapturePhoto = { [weak self] description, completion in + guard let self else { completion(.failure("Session ended")); return } + guard let frame = self.latestVideoFrame else { + completion(.failure("No camera frame available to capture")) + return + } + if let photo = self.photoCaptureStore.saveFrame(frame, description: description) { + self.lastCapturedPhoto = photo + // Also upload to Mac so agent can access the file + if let jpegData = frame.jpegData(compressionQuality: 0.9) { + let base64 = jpegData.base64EncodedString() + if let macPath = self.openClawBridge.uploadImageFile(base64) { + completion(.success("Photo captured and saved: \(photo.filename)\nAlso saved on Mac at: \(macPath)")) + } else { + completion(.success("Photo captured and saved: \(photo.filename)")) + } + } else { + completion(.success("Photo captured and saved: \(photo.filename)")) + } + } else { + completion(.failure("Failed to save photo")) + } + } + + // Auto-save to gallery when image is attached to execute call + toolCallRouter?.onAutoSaveFrame = { [weak self] image, description in + guard let self else { return } + if let photo = self.photoCaptureStore.saveFrame(image, description: description) { + self.lastCapturedPhoto = photo + } + } + geminiService.onToolCall = { [weak self] toolCall in guard 
let self else { return } Task { @MainActor in for call in toolCall.functionCalls { + self.finalizeCurrentBubbles() + let msg = ChatMessage(role: .toolCall(call.name), text: "Executing...", status: .streaming) + self.messages.append(msg) + let toolMsgId = msg.id + + let taskDesc = (call.args["task"] as? String) ?? "" + RemoteLogger.shared.log("voice:tool_call", data: ["tool": call.name, "task": taskDesc]) + self.toolCallRouter?.handleToolCall(call) { [weak self] response in - self?.geminiService.sendToolResponse(response) + guard let self else { return } + if let idx = self.messages.firstIndex(where: { $0.id == toolMsgId }) { + self.messages[idx].text = "Done" + self.messages[idx].status = .complete + } + RemoteLogger.shared.log("voice:tool_result", data: ["tool": call.name, "result": String(response.prefix(500))]) + // Reset active bubbles so post-tool AI text goes into a new bubble + self.finalizeCurrentBubbles() + self.geminiService.sendToolResponse(response) } } } @@ -163,7 +239,7 @@ class GeminiSessionViewModel: ObservableObject { return } - // Connect to OpenClaw event stream for proactive notifications + // Always connect event client — needed for image sending via chat.send if SettingsManager.shared.proactiveNotificationsEnabled { eventClient.onNotification = { [weak self] text in guard let self else { return } @@ -172,11 +248,14 @@ class GeminiSessionViewModel: ObservableObject { self.geminiService.sendTextMessage(text) } } - eventClient.connect() + } else { + eventClient.onNotification = nil } + eventClient.connect() } func stopSession() { + RemoteLogger.shared.log("session:end") eventClient.disconnect() toolCallRouter?.cancelAll() toolCallRouter = nil @@ -190,9 +269,13 @@ class GeminiSessionViewModel: ObservableObject { userTranscript = "" aiTranscript = "" toolCallStatus = .idle + ChatHistoryStore.save(messages) } func sendVideoFrameIfThrottled(image: UIImage) { + // Always keep latest frame for capture_photo and include_image + latestVideoFrame = image 
+ toolCallRouter?.latestFrame = image guard SettingsManager.shared.videoStreamingEnabled else { return } guard isGeminiActive, connectionState == .ready else { return } let now = Date() @@ -201,4 +284,51 @@ class GeminiSessionViewModel: ObservableObject { geminiService.sendVideoFrame(image: image) } + // MARK: - Chat message helpers + + private func updateUserBubble(_ text: String) { + guard !text.isEmpty else { return } + if let id = activeUserBubbleId, let idx = messages.firstIndex(where: { $0.id == id }) { + messages[idx].text = text + } else { + // Finalize previous AI bubble before starting new user turn + if let aiId = activeAIBubbleId, let idx = messages.firstIndex(where: { $0.id == aiId }) { + messages[idx].status = .complete + activeAIBubbleId = nil + } + let msg = ChatMessage(role: .user, text: text, status: .streaming) + messages.append(msg) + activeUserBubbleId = msg.id + } + lastUserText = text + } + + private func updateAIBubble(_ text: String) { + guard !text.isEmpty else { return } + // Finalize user bubble when AI starts responding + if let userId = activeUserBubbleId, let idx = messages.firstIndex(where: { $0.id == userId }) { + messages[idx].status = .complete + } + if let id = activeAIBubbleId, let idx = messages.firstIndex(where: { $0.id == id }) { + messages[idx].text = text + } else { + let msg = ChatMessage(role: .assistant, text: text, status: .streaming) + messages.append(msg) + activeAIBubbleId = msg.id + } + lastAIText = text + } + + private func finalizeCurrentBubbles() { + if let id = activeUserBubbleId, let idx = messages.firstIndex(where: { $0.id == id }) { + messages[idx].status = .complete + } + if let id = activeAIBubbleId, let idx = messages.firstIndex(where: { $0.id == id }) { + messages[idx].status = .complete + } + activeUserBubbleId = nil + activeAIBubbleId = nil + lastUserText = "" + lastAIText = "" + } } diff --git a/samples/CameraAccess/CameraAccess/Gemini/RemoteLogger.swift 
b/samples/CameraAccess/CameraAccess/Gemini/RemoteLogger.swift new file mode 100644 index 00000000..e64bdc1e --- /dev/null +++ b/samples/CameraAccess/CameraAccess/Gemini/RemoteLogger.swift @@ -0,0 +1,61 @@ +import Foundation + +/// Sends conversation events to the logging server for persistent logging. +/// All methods are fire-and-forget -- logging never blocks the UI or conversation flow. +final class RemoteLogger { + static let shared = RemoteLogger() + + private let session: URLSession + private var sequenceNumber = 0 + + private init() { + let config = URLSessionConfiguration.default + config.timeoutIntervalForRequest = 5 + self.session = URLSession(configuration: config) + } + + /// The base URL for the logging server (same host as OpenClaw, port 8080). + private var baseURL: String? { + guard GeminiConfig.isOpenClawConfigured else { return nil } + let host = GeminiConfig.openClawHost + return "\(host):8080" + } + + /// Log a conversation event. Types: + /// - "voice:user" -- user speech transcript from Gemini + /// - "voice:ai" -- Gemini voice response transcript + /// - "voice:tool_call" -- Gemini triggered execute tool + /// - "voice:tool_result" -- tool result sent back to Gemini + /// - "session:start" -- voice session started + /// - "session:end" -- voice session ended + func log(_ type: String, data: [String: String] = [:]) { + guard let baseURL else { return } + guard let url = URL(string: "\(baseURL)/api/logs") else { return } + + sequenceNumber += 1 + let eventData: [String: Any] = [ + "event": type, + "seq": sequenceNumber + ].merging(data) { _, new in new } + + let payload: [String: Any] = [ + "type": "event", + "session": "ios-client", + "data": eventData + ] + + var request = URLRequest(url: url) + request.httpMethod = "POST" + request.setValue("application/json", forHTTPHeaderField: "Content-Type") + request.setValue(GeminiConfig.openClawGatewayToken, forHTTPHeaderField: "x-api-token") + + do { + request.httpBody = try 
JSONSerialization.data(withJSONObject: payload) + } catch { return } + + // Fire and forget + Task.detached(priority: .utility) { [session] in + _ = try? await session.data(for: request) + } + } +} diff --git a/samples/CameraAccess/CameraAccess/OpenClaw/OpenClawBridge.swift b/samples/CameraAccess/CameraAccess/OpenClaw/OpenClawBridge.swift index 1f48ac6f..befca4c1 100644 --- a/samples/CameraAccess/CameraAccess/OpenClaw/OpenClawBridge.swift +++ b/samples/CameraAccess/CameraAccess/OpenClaw/OpenClawBridge.swift @@ -1,4 +1,5 @@ import Foundation +import UIKit enum OpenClawConnectionState: Equatable { case notConfigured @@ -12,10 +13,13 @@ class OpenClawBridge: ObservableObject { @Published var lastToolCallStatus: ToolCallStatus = .idle @Published var connectionState: OpenClawConnectionState = .notConfigured + /// Set by GeminiSessionViewModel so we can send image tasks via WebSocket + var eventClient: OpenClawEventClient? + private let session: URLSession private let pingSession: URLSession private var sessionKey: String - private var conversationHistory: [[String: String]] = [] + private var conversationHistory: [[String: Any]] = [] private let maxHistoryTurns = 10 private static let stableSessionKey = "agent:main:glass" @@ -69,17 +73,29 @@ class OpenClawBridge: ObservableObject { func delegateTask( task: String, - toolName: String = "execute" + toolName: String = "execute", + image: UIImage? 
= nil ) async -> ToolResult { lastToolCallStatus = .executing(toolName) + // If image is provided, route through WebSocket chat.send (only working method) + if let image = image, let jpegData = image.jpegData(compressionQuality: 0.8) { + let base64 = jpegData.base64EncodedString() + if let ec = eventClient { + NSLog("[OpenClaw] Sending image task via WebSocket chat.send (%d KB)", jpegData.count / 1024) + return await sendViaWebSocket(eventClient: ec, task: task, imageBase64: base64, toolName: toolName) + } else { + NSLog("[OpenClaw] Image task but no event client, falling back to text-only HTTP") + } + } + guard let url = URL(string: "\(GeminiConfig.openClawHost):\(GeminiConfig.openClawPort)/v1/chat/completions") else { lastToolCallStatus = .failed(toolName, "Invalid URL") return .failure("Invalid gateway URL") } - // Append the new user message to conversation history - conversationHistory.append(["role": "user", "content": task]) + let userMessage: [String: Any] = ["role": "user", "content": task] + conversationHistory.append(userMessage) // Trim history to keep only the most recent turns (user+assistant pairs) if conversationHistory.count > maxHistoryTurns * 2 { @@ -99,7 +115,7 @@ class OpenClawBridge: ObservableObject { "stream": false ] - NSLog("[OpenClaw] Sending %d messages in conversation", conversationHistory.count) + NSLog("[OpenClaw] Sending %d messages in conversation%@", conversationHistory.count, image != nil ? " (with image)" : "") do { request.httpBody = try JSONSerialization.data(withJSONObject: body) @@ -137,4 +153,69 @@ class OpenClawBridge: ObservableObject { return .failure("Agent error: \(error.localizedDescription)") } } + + /// Upload JPEG to the upload server so the agent can access the file on disk. + func uploadImageFile(_ imageBase64: String) -> String? 
{ + let uploadPort = GeminiConfig.openClawPort + 6 // upload server runs on gateway port + 6 + guard let url = URL(string: "\(GeminiConfig.openClawHost):\(uploadPort)/upload") else { return nil } + guard let jpegData = Data(base64Encoded: imageBase64) else { return nil } + + var request = URLRequest(url: url) + request.httpMethod = "POST" + request.setValue("image/jpeg", forHTTPHeaderField: "Content-Type") + request.httpBody = jpegData + request.timeoutInterval = 10 + + let semaphore = DispatchSemaphore(value: 0) + var filePath: String? + + let task = URLSession.shared.dataTask(with: request) { data, response, _ in + if let data, + let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any], + let path = json["path"] as? String { + filePath = path + NSLog("[OpenClaw] Image uploaded to: %@", path) + } + semaphore.signal() + } + task.resume() + semaphore.wait() + return filePath + } + + /// Send a task with image via WebSocket chat.send RPC. + /// Also uploads the image file to disk so the agent can access it. + private func sendViaWebSocket( + eventClient: OpenClawEventClient, + task: String, + imageBase64: String, + toolName: String + ) async -> ToolResult { + // Upload image to disk so agent can read/copy/save the file + let filePath = uploadImageFile(imageBase64) + let taskWithPath = filePath != nil ? 
"\(task)\n\n[image_file_path]\n\(filePath!)" : task + + return await withCheckedContinuation { continuation in + eventClient.sendChatMessage( + sessionKey: sessionKey, + message: taskWithPath, + imageBase64: imageBase64 + ) { [weak self] reply in + guard let self else { + continuation.resume(returning: .failure("Session ended")) + return + } + if let reply { + self.conversationHistory.append(["role": "user", "content": task]) + self.conversationHistory.append(["role": "assistant", "content": reply]) + NSLog("[OpenClaw] WebSocket chat.send result: %@", String(reply.prefix(200))) + self.lastToolCallStatus = .completed(toolName) + continuation.resume(returning: .success(reply)) + } else { + self.lastToolCallStatus = .failed(toolName, "WebSocket chat.send failed") + continuation.resume(returning: .failure("Failed to send image via WebSocket")) + } + } + } + } } diff --git a/samples/CameraAccess/CameraAccess/OpenClaw/OpenClawEventClient.swift b/samples/CameraAccess/CameraAccess/OpenClaw/OpenClawEventClient.swift index 8ceeef59..bb0e6872 100644 --- a/samples/CameraAccess/CameraAccess/OpenClaw/OpenClawEventClient.swift +++ b/samples/CameraAccess/CameraAccess/OpenClaw/OpenClawEventClient.swift @@ -10,6 +10,10 @@ class OpenClawEventClient { private var reconnectDelay: TimeInterval = 2 private let maxReconnectDelay: TimeInterval = 30 + // Pending RPC responses and chat results + private var pendingResponses: [String: ([String: Any]) -> Void] = [:] + private var pendingChatResults: [String: (String?) 
-> Void] = [:] + func connect() { guard GeminiConfig.isOpenClawConfigured else { NSLog("[OpenClawWS] Not configured, skipping") @@ -24,6 +28,9 @@ class OpenClawEventClient { func disconnect() { shouldReconnect = false isConnected = false + // Cancel all pending callbacks so they don't fire after session stops + pendingResponses.removeAll() + pendingChatResults.removeAll() webSocketTask?.cancel(with: .normalClosure, reason: nil) webSocketTask = nil session?.invalidateAndCancel() @@ -46,7 +53,9 @@ class OpenClawEventClient { let config = URLSessionConfiguration.default config.timeoutIntervalForRequest = 30 session = URLSession(configuration: config) - webSocketTask = session?.webSocketTask(with: url) + var request = URLRequest(url: url) + request.setValue("localhost:\(port)", forHTTPHeaderField: "Host") + webSocketTask = session?.webSocketTask(with: request) webSocketTask?.resume() NSLog("[OpenClawWS] Connecting to %@", url.absoluteString) @@ -85,15 +94,20 @@ class OpenClawEventClient { if type == "event" { handleEvent(json) } else if type == "res" { - let ok = json["ok"] as? Bool ?? false - if ok { - NSLog("[OpenClawWS] Connected and authenticated") - isConnected = true - reconnectDelay = 2 + let id = json["id"] as? String ?? "" + if let callback = pendingResponses.removeValue(forKey: id) { + callback(json) } else { - let error = json["error"] as? [String: Any] - let msg = error?["message"] as? String ?? "unknown" - NSLog("[OpenClawWS] Connect failed: %@", msg) + let ok = json["ok"] as? Bool ?? false + if ok { + NSLog("[OpenClawWS] Connected and authenticated") + isConnected = true + reconnectDelay = 2 + } else { + let error = json["error"] as? [String: Any] + let msg = error?["message"] as? String ?? 
"unknown" + NSLog("[OpenClawWS] Connect failed: %@", msg) + } } } } @@ -112,6 +126,9 @@ class OpenClawEventClient { case "cron": handleCronEvent(payload) + case "chat": + handleChatEvent(payload) + default: break } @@ -124,7 +141,7 @@ class OpenClawEventClient { "method": "connect", "params": [ "minProtocol": 3, - "maxProtocol": 3, + "maxProtocol": 4, "client": [ "id": "ios-node", "displayName": "VisionClaw Glass", @@ -133,13 +150,13 @@ class OpenClawEventClient { "mode": "node" ], "role": "node", - "scopes": [] as [String], "caps": ["camera", "voice"], "commands": [] as [String], "permissions": [:] as [String: Any], "auth": [ "token": GeminiConfig.openClawGatewayToken - ] + ], + "scopes": ["operator.admin"] ] as [String: Any] ] @@ -179,6 +196,103 @@ class OpenClawEventClient { onNotification?("[Scheduled update] \(summary)") } + private func handleChatEvent(_ payload: [String: Any]) { + let state = payload["state"] as? String ?? "" + let runId = payload["runId"] as? String ?? "" + guard !runId.isEmpty else { return } + + if state == "final" { + if let callback = pendingChatResults.removeValue(forKey: runId) { + let message = payload["message"] as? [String: Any] + let content = message?["content"] + let replyText: String? + if let text = content as? String { + replyText = text + } else if let parts = content as? [[String: Any]] { + replyText = parts.compactMap { ($0["type"] as? String == "text") ? $0["text"] as? String : nil }.joined(separator: "\n") + } else { + replyText = nil + } + NSLog("[OpenClawWS] chat final for %@: %@", runId, String((replyText ?? "nil").prefix(200))) + callback(replyText ?? "Agent completed but returned no text.") + } + } else if state == "error" { + if let callback = pendingChatResults.removeValue(forKey: runId) { + let errorMsg = payload["errorMessage"] as? String ?? 
"Agent error" + NSLog("[OpenClawWS] chat error for %@: %@", runId, errorMsg) + callback(nil) + } + } + } + + /// Send a chat message with optional image attachment via WebSocket chat.send RPC. + /// This is the only way to reliably pass images to the OpenClaw agent. + func sendChatMessage( + sessionKey: String, + message: String, + imageBase64: String? = nil, + completion: @escaping (String?) -> Void + ) { + guard isConnected, webSocketTask != nil else { + NSLog("[OpenClawWS] Cannot send chat.send: not connected") + completion(nil) + return + } + + let reqId = UUID().uuidString + var params: [String: Any] = [ + "sessionKey": sessionKey, + "message": message, + "idempotencyKey": reqId + ] + + if let imageBase64 { + params["attachments"] = [[ + "mimeType": "image/jpeg", + "fileName": "camera_frame.jpg", + "content": imageBase64 + ]] + } + + let request: [String: Any] = [ + "type": "req", + "id": reqId, + "method": "chat.send", + "params": params + ] + + // Register RPC ack callback — then wait for chat event + pendingResponses[reqId] = { [weak self] response in + let ok = response["ok"] as? Bool ?? false + if ok { + NSLog("[OpenClawWS] chat.send accepted, waiting for agent reply (runId=%@)", reqId) + self?.pendingChatResults[reqId] = completion + } else { + let error = response["error"] as? [String: Any] + let msg = error?["message"] as? String ?? "unknown" + NSLog("[OpenClawWS] chat.send rejected: %@", msg) + completion(nil) + } + } + + guard let data = try? JSONSerialization.data(withJSONObject: request), + let string = String(data: data, encoding: .utf8) else { + pendingResponses.removeValue(forKey: reqId) + completion(nil) + return + } + + webSocketTask?.send(.string(string)) { [weak self] error in + if let error { + NSLog("[OpenClawWS] chat.send send error: %@", error.localizedDescription) + self?.pendingResponses.removeValue(forKey: reqId) + completion(nil) + } else { + NSLog("[OpenClawWS] chat.send sent (id=%@, hasImage=%@)", reqId, imageBase64 != nil ? 
"true" : "false") + } + } + } + private func scheduleReconnect() { guard shouldReconnect else { return } NSLog("[OpenClawWS] Reconnecting in %.0fs", reconnectDelay) diff --git a/samples/CameraAccess/CameraAccess/OpenClaw/ToolCallModels.swift b/samples/CameraAccess/CameraAccess/OpenClaw/ToolCallModels.swift index c7222a28..4130e720 100644 --- a/samples/CameraAccess/CameraAccess/OpenClaw/ToolCallModels.swift +++ b/samples/CameraAccess/CameraAccess/OpenClaw/ToolCallModels.swift @@ -85,9 +85,24 @@ enum ToolCallStatus: Equatable { enum ToolDeclarations { static func allDeclarations() -> [[String: Any]] { - return [execute] + return [execute, capturePhoto] } + static let capturePhoto: [String: Any] = [ + "name": "capture_photo", + "description": "Capture and save the current camera frame as a photo. Use when the user asks to take a photo, capture what they see, save a picture, or snap a photo.", + "parameters": [ + "type": "object", + "properties": [ + "description": [ + "type": "string", + "description": "Brief description of what is in the photo" + ] + ], + "required": [] as [String] + ] as [String: Any] + ] + static let execute: [String: Any] = [ "name": "execute", "description": "Your only way to take action. You have no memory, storage, or ability to do anything on your own -- use this tool for everything: sending messages, searching the web, adding to lists, setting reminders, creating notes, research, drafts, scheduling, smart home control, app interactions, or any request that goes beyond answering a question. When in doubt, use this tool.", @@ -97,10 +112,14 @@ enum ToolDeclarations { "task": [ "type": "string", "description": "Clear, detailed description of what to do. Include all relevant context: names, content, platforms, quantities, etc." + ], + "include_image": [ + "type": "boolean", + "description": "Set to true ONLY when the task requires the agent to see the current camera image (e.g. 
editing a photo, identifying a product by appearance, reading text from a sign). Do NOT set for tasks that can be described in text alone." ] ], "required": ["task"] ] as [String: Any], - "behavior": "BLOCKING" + "behavior": "NON_BLOCKING" ] } diff --git a/samples/CameraAccess/CameraAccess/OpenClaw/ToolCallRouter.swift b/samples/CameraAccess/CameraAccess/OpenClaw/ToolCallRouter.swift index a20babf4..26c82b50 100644 --- a/samples/CameraAccess/CameraAccess/OpenClaw/ToolCallRouter.swift +++ b/samples/CameraAccess/CameraAccess/OpenClaw/ToolCallRouter.swift @@ -1,4 +1,5 @@ import Foundation +import UIKit @MainActor class ToolCallRouter { @@ -7,6 +8,15 @@ class ToolCallRouter { private var consecutiveFailures = 0 private let maxConsecutiveFailures = 3 + /// Callback for local capture_photo handling. Called with (description, completion). + var onCapturePhoto: ((_ description: String?, _ completion: @escaping (ToolResult) -> Void) -> Void)? + + /// Latest camera frame for include_image on execute tool calls. + var latestFrame: UIImage? + + /// Callback to auto-save frame to gallery when image is attached to execute call. + var onAutoSaveFrame: ((_ image: UIImage, _ description: String?) -> Void)? + init(bridge: OpenClawBridge) { self.bridge = bridge } @@ -23,6 +33,27 @@ class ToolCallRouter { NSLog("[ToolCall] Received: %@ (id: %@) args: %@", callName, callId, String(describing: call.args)) + // Local tool: capture_photo; handle on-device and do not send to OpenClaw. + if callName == "capture_photo" { + let description = call.args["description"] as? 
String + if let onCapturePhoto { + onCapturePhoto(description) { [weak self] result in + guard let self else { return } + NSLog("[ToolCall] capture_photo result: %@", String(describing: result)) + let response = self.buildToolResponse(callId: callId, name: callName, result: result) + sendResponse(response) + } + } else { + let response = buildToolResponse( + callId: callId, + name: callName, + result: .failure("capture_photo handler not configured") + ) + sendResponse(response) + } + return + } + // Circuit breaker: stop sending tool calls after repeated failures if consecutiveFailures >= maxConsecutiveFailures { NSLog("[ToolCall] Circuit breaker open (%d consecutive failures), rejecting %@", @@ -38,7 +69,14 @@ class ToolCallRouter { let task = Task { @MainActor in let taskDesc = call.args["task"] as? String ?? String(describing: call.args) - let result = await bridge.delegateTask(task: taskDesc, toolName: callName) + // Attach image only when Gemini explicitly sets include_image=true + let includeImage = call.args["include_image"] as? Bool ?? false + let image: UIImage? = includeImage ? 
latestFrame : nil + // Auto-save to gallery when image is attached + if let image { + onAutoSaveFrame?(image, String(taskDesc.prefix(100))) + } + let result = await bridge.delegateTask(task: taskDesc, toolName: callName, image: image) guard !Task.isCancelled else { NSLog("[ToolCall] Task %@ was cancelled, skipping response", callId) @@ -99,7 +137,7 @@ class ToolCallRouter { [ "id": callId, "name": name, - "response": result.responseValue + "response": result.responseValue.merging(["scheduling": "INTERRUPT"]) { _, new in new } ] ] ] diff --git a/samples/CameraAccess/CameraAccess/Views/StreamView.swift b/samples/CameraAccess/CameraAccess/Views/StreamView.swift index 3fc83f72..26517bda 100644 --- a/samples/CameraAccess/CameraAccess/Views/StreamView.swift +++ b/samples/CameraAccess/CameraAccess/Views/StreamView.swift @@ -22,6 +22,14 @@ struct StreamView: View { @ObservedObject var wearablesVM: WearablesViewModel @ObservedObject var geminiVM: GeminiSessionViewModel @ObservedObject var webrtcVM: WebRTCSessionViewModel + @State private var selectedTab: StreamTab = .camera + @State private var showGallery = false + @State private var showCaptureToast = false + + enum StreamTab: String, CaseIterable { + case camera = "Camera" + case chat = "Chat" + } var body: some View { ZStack { @@ -29,64 +37,49 @@ struct StreamView: View { Color.black .edgesIgnoringSafeArea(.all) - // Video backdrop: PiP when WebRTC connected, otherwise single local feed - if webrtcVM.isActive && webrtcVM.connectionState == .connected { - PiPVideoView( - localFrame: viewModel.currentVideoFrame, - remoteVideoTrack: webrtcVM.remoteVideoTrack, - hasRemoteVideo: webrtcVM.hasRemoteVideo - ) - } else if let videoFrame = viewModel.currentVideoFrame, viewModel.hasReceivedFirstFrame { - GeometryReader { geometry in - Image(uiImage: videoFrame) - .resizable() - .aspectRatio(contentMode: .fill) - .frame(width: geometry.size.width, height: geometry.size.height) - .clipped() - } - .edgesIgnoringSafeArea(.all) - } 
else { - ProgressView() - .scaleEffect(1.5) - .foregroundColor(.white) + TabView(selection: $selectedTab) { + // --- Camera tab --- + cameraContent + .tag(StreamTab.camera) + + // --- Chat tab --- + ChatTranscriptView(geminiVM: geminiVM) + .padding(.top, 60) + .padding(.bottom, 80) + .tag(StreamTab.chat) } + .tabViewStyle(.page(indexDisplayMode: .never)) - // Gemini status overlay (top) + speaking indicator - if geminiVM.isGeminiActive { - VStack { - GeminiStatusBar(geminiVM: geminiVM) + // Top bar + VStack { + HStack { + if geminiVM.isGeminiActive { + GeminiStatusBar(geminiVM: geminiVM) + } Spacer() - - VStack(spacing: 8) { - if !geminiVM.userTranscript.isEmpty || !geminiVM.aiTranscript.isEmpty { - TranscriptView( - userText: geminiVM.userTranscript, - aiText: geminiVM.aiTranscript - ) - } - - ToolCallStatusView(status: geminiVM.toolCallStatus) - - if geminiVM.isModelSpeaking { - HStack(spacing: 8) { - Image(systemName: "speaker.wave.2.fill") - .foregroundColor(.white) - .font(.system(size: 14)) - SpeakingIndicator() - } - .padding(.horizontal, 16) - .padding(.vertical, 8) + Button(action: { showGallery = true }) { + Image(systemName: "photo.on.rectangle") + .font(.system(size: 16, weight: .medium)) + .foregroundColor(.white) + .padding(8) .background(Color.black.opacity(0.5)) - .cornerRadius(20) + .clipShape(Circle()) + } + Picker("", selection: $selectedTab) { + ForEach(StreamTab.allCases, id: \.self) { tab in + Text(tab.rawValue).tag(tab) } } - .padding(.bottom, 80) + .pickerStyle(.segmented) + .frame(width: 140) } - .padding(.all, 24) + Spacer() } + .padding(.horizontal, 24) + .padding(.top, 24) // WebRTC status overlay (top) - if webrtcVM.isActive { + if webrtcVM.isActive && selectedTab == .camera { VStack { WebRTCStatusBar(webrtcVM: webrtcVM) Spacer() @@ -101,6 +94,12 @@ struct StreamView: View { } .padding(.all, 24) } + // Auto-switch to chat tab when Gemini starts if no video + .onChange(of: geminiVM.isGeminiActive) { _, active in + if active && 
!SettingsManager.shared.videoStreamingEnabled { + selectedTab = .chat + } + } .onDisappear { Task { if viewModel.streamingStatus != .stopped { @@ -114,6 +113,34 @@ struct StreamView: View { } } } + // Gallery sheet + .sheet(isPresented: $showGallery) { + NavigationStack { + GalleryView() + } + } + // Capture toast + .overlay(alignment: .top) { + if showCaptureToast { + Text("Photo captured") + .font(.subheadline.weight(.medium)) + .foregroundColor(.white) + .padding(.horizontal, 16) + .padding(.vertical, 8) + .background(Color.black.opacity(0.7)) + .cornerRadius(20) + .padding(.top, 80) + .transition(.move(edge: .top).combined(with: .opacity)) + } + } + .onChange(of: geminiVM.lastCapturedPhoto?.id) { _, newId in + guard newId != nil else { return } + withAnimation { showCaptureToast = true } + Task { + try? await Task.sleep(nanoseconds: 2_000_000_000) + withAnimation { showCaptureToast = false } + } + } // Show captured photos from DAT SDK in a preview sheet .sheet(isPresented: $viewModel.showPhotoPreview) { if let photo = viewModel.capturedPhoto { @@ -144,6 +171,61 @@ struct StreamView: View { Text(webrtcVM.errorMessage ?? 
"") } } + + @ViewBuilder + private var cameraContent: some View { + // Video backdrop: PiP when WebRTC connected, otherwise single local feed + if webrtcVM.isActive && webrtcVM.connectionState == .connected { + PiPVideoView( + localFrame: viewModel.currentVideoFrame, + remoteVideoTrack: webrtcVM.remoteVideoTrack, + hasRemoteVideo: webrtcVM.hasRemoteVideo + ) + } else if let videoFrame = viewModel.currentVideoFrame, viewModel.hasReceivedFirstFrame { + GeometryReader { geometry in + Image(uiImage: videoFrame) + .resizable() + .aspectRatio(contentMode: .fill) + .frame(width: geometry.size.width, height: geometry.size.height) + .clipped() + } + .edgesIgnoringSafeArea(.all) + } else { + ProgressView() + .scaleEffect(1.5) + .foregroundColor(.white) + } + + // Gemini speaking/transcript overlay on camera + if geminiVM.isGeminiActive { + VStack { + Spacer() + VStack(spacing: 8) { + if !geminiVM.userTranscript.isEmpty || !geminiVM.aiTranscript.isEmpty { + TranscriptView( + userText: geminiVM.userTranscript, + aiText: geminiVM.aiTranscript + ) + } + ToolCallStatusView(status: geminiVM.toolCallStatus) + if geminiVM.isModelSpeaking { + HStack(spacing: 8) { + Image(systemName: "speaker.wave.2.fill") + .foregroundColor(.white) + .font(.system(size: 14)) + SpeakingIndicator() + } + .padding(.horizontal, 16) + .padding(.vertical, 8) + .background(Color.black.opacity(0.5)) + .cornerRadius(20) + } + } + .padding(.bottom, 80) + } + .padding(.horizontal, 24) + } + } } // Extracted controls for clarity @@ -208,3 +290,4 @@ struct ControlsView: View { } } } + diff --git a/samples/CameraAccess/scripts/ANALYSIS_REPORT.md b/samples/CameraAccess/scripts/ANALYSIS_REPORT.md new file mode 100644 index 00000000..4ef9e834 --- /dev/null +++ b/samples/CameraAccess/scripts/ANALYSIS_REPORT.md @@ -0,0 +1,127 @@ +# VisionClaw Usage Analysis Report - P1 (Xiaoan) + +Generated: 2026-03-26 +Data source: `~/.openclaw/agents/main/sessions/sessions.json` (glass sessions) + +## 1. 
Basic Statistics + + | Metric | Value | +|--------|-------| +| Date range | 2026-02-06 to 2026-03-24 | +| Active days | 13 (14 raw, 1 excluded as system-only) | +| Total sessions | 79 | +| Total interactions | 133 (155 raw, 22 system/setup excluded) | +| Avg interactions/active day | 10.2 | +| Total tool calls (OpenClaw) | 500 | +| Avg tool calls/interaction | 3.2 (500 / 155 raw user messages; 3.8 over the 133 filtered interactions) | +| Avg session duration | 431s (7.2 min), median 32s | + +### Per-day breakdown + +| Date | Interactions | +|------|-------------| +| 2026-02-06 | 32 | +| 2026-02-10 | 13 | +| 2026-02-11 | 27 | +| 2026-02-12 | 1 | +| 2026-02-14 | 7 | +| 2026-02-15 | 6 | +| 2026-02-18 | 10 | +| 2026-03-03 | 5 | +| 2026-03-07 | 6 | +| 2026-03-09 | 1 | +| 2026-03-10 | 4 | +| 2026-03-12 | 3 | +| 2026-03-15 | 18 | + +## 2. Category Breakdown (Primary) + +| Category | Count | % | +|----------|-------|---| +| Shop | 87 | 65.4% | +| Retrieve | 20 | 15.0% | +| Communicate | 13 | 9.8% | +| Save | 13 | 9.8% | +| Recall | 0 | 0.0% | +| Control | 0 | 0.0% | + +### Multi-label breakdown (interactions can belong to multiple categories) + +| Category | Count | % | +|----------|-------|---| +| Shop | 87 | 65.4% | +| Retrieve | 73 | 54.9% | +| Save | 22 | 16.5% | +| Communicate | 13 | 9.8% | +| Recall | 0 | 0.0% | +| Control | 0 | 0.0% | + +**Note:** Classification is keyword-based. Most shopping interactions also involve "retrieve" (searching Amazon). For the paper, Ryo may want to use LLM-based classification for more nuance - the input file `llm-classify-prompt.json` is ready for that. + +## 3. Camera-based Usage + +- Camera/visually-grounded interactions: **3 / 133 (2.3%)** +- Examples: + - "Add the visible red Gatorade drink to the user's shopping cart" + - "Add the item currently displayed on the Amazon tab to the user's Amazon cart" + - Chinese: searching for yogurt "in front of me" on Amazon + +**Important caveat:** These OpenClaw logs only capture the text commands delegated from Gemini. 
ALL glass sessions stream camera frames (~1fps) to Gemini continuously. The "camera-based" count here only reflects interactions where the user's voice command explicitly referenced what they were seeing. Actual visual grounding is much higher since Gemini has continuous visual context. + +## 4. Tool Call Latency + +### Browser vs Non-browser + +| Metric | Browser (n=439) | Non-browser (n=60) | +|--------|-----------------|-------------------| +| Mean | 515 ms | 348 ms | +| Median | 144 ms | 37 ms | +| P25 | 53 ms | 19 ms | +| P75 | 237 ms | 721 ms | +| P95 | 2,564 ms | 1,811 ms | +| Max | 11,817 ms | 2,288 ms | + +### Per-tool breakdown + +| Tool | Count | Median (ms) | Mean (ms) | +|------|-------|-------------|-----------| +| browser | 439 | 144 | 515 | +| read | 17 | 19 | 29 | +| write | 11 | 25 | 25 | +| exec | 10 | 75 | 293 | +| memory_search | 8 | 1,202 | 1,252 | +| edit | 6 | 27 | 403 | +| web_search | 6 | 743 | 771 | +| nodes | 2 | 66 | 57 | + +**Note:** These are OpenClaw-side tool execution latencies (from tool call initiation to result return). End-to-end latency from user speech to spoken response also includes: Gemini STT, Gemini thinking, iOS->Gemini round-trip, and Gemini TTS - not captured here. + +## 5. 
Tool Usage Breakdown + +| Tool | Count | % | +|------|-------|---| +| browser | 440 | 88.0% | +| read | 17 | 3.4% | +| write | 11 | 2.2% | +| exec | 10 | 2.0% | +| memory_search | 8 | 1.6% | +| edit | 6 | 1.2% | +| web_search | 6 | 1.2% | +| nodes | 2 | 0.4% | + +## Scripts + +- `extract_glass_sessions.py` - Extract raw + structured JSONL from OpenClaw session store +- `analyze_glass_sessions.py` - Compute all stats (basic, latency, categories, camera) +- `classify_with_llm.py` - Refined keyword classification with system message filtering + +## Output files (in /tmp/visionclaw-data/) + +- `glass-sessions-raw.jsonl` - All raw session data +- `glass-sessions-structured.jsonl` - Clean structured messages +- `glass-sessions-classifications.jsonl` - Per-interaction classifications +- `glass-sessions-llm-classifications.jsonl` - Refined classifications +- `glass-sessions-latencies.jsonl` - Per-tool-call latency data +- `p1-xiaoan-summary.json` - Paper-ready summary JSON +- `p1-xiaoan-classifications-summary.json` - Classification summary JSON +- `llm-classify-prompt.json` - Input for LLM-based classification diff --git a/samples/CameraAccess/scripts/analyze_glass_sessions.py b/samples/CameraAccess/scripts/analyze_glass_sessions.py new file mode 100644 index 00000000..37049e82 --- /dev/null +++ b/samples/CameraAccess/scripts/analyze_glass_sessions.py @@ -0,0 +1,470 @@ +#!/usr/bin/env python3 +""" +analyze_glass_sessions.py — Analyze VisionClaw glass session logs for UIST paper. + +Reads from structured JSONL (output of extract_glass_sessions.py). +Produces statistics needed for the paper: + 1. Basic stats: active days, sessions, interactions, avg/day + 2. Category breakdown: communicate, retrieve, save, recall, shop, control + 3. Camera-based usage extraction + 4. Tool call latency: browser vs non-browser + 5. 
Fine-grained stats + +Usage: python3 analyze_glass_sessions.py [input-dir] [output-dir] +""" + +import json +import sys +import os +import re +from pathlib import Path +from datetime import datetime, timedelta +from collections import Counter, defaultdict + +INPUT_DIR = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("/tmp/visionclaw-data") +OUTPUT_DIR = Path(sys.argv[2]) if len(sys.argv) > 2 else INPUT_DIR + +STRUCTURED_FILE = INPUT_DIR / "glass-sessions-structured.jsonl" + +# ============================================================ +# Load data +# ============================================================ +records = [] +with open(STRUCTURED_FILE) as f: + for line in f: + line = line.strip() + if line: + records.append(json.loads(line)) + +print(f"Loaded {len(records)} records from {STRUCTURED_FILE}") + +# ============================================================ +# 1. BASIC STATS +# ============================================================ +print("\n" + "=" * 60) +print("1. BASIC STATISTICS") +print("=" * 60) + +user_msgs = [r for r in records if r["role"] == "user"] +assistant_msgs = [r for r in records if r["role"] == "assistant"] +tool_results = [r for r in records if r["role"] == "toolResult"] + +# Active days +active_days = sorted(set(r["timestamp"][:10] for r in user_msgs if r["timestamp"])) +date_range_start = active_days[0] if active_days else "?" +date_range_end = active_days[-1] if active_days else "?" 
+ +# Sessions = unique session_key values +session_keys = sorted(set(r["session_key"] for r in records)) + +# Interactions = user messages (each user turn = 1 interaction) +interactions = len(user_msgs) + +# Avg uses per active day +avg_per_day = interactions / len(active_days) if active_days else 0 + +# Per-day breakdown +day_counts = Counter(r["timestamp"][:10] for r in user_msgs if r["timestamp"]) + +print(f"Date range: {date_range_start} to {date_range_end}") +print(f"# Active days: {len(active_days)}") +print(f"# Sessions: {len(session_keys)}") +print(f"# Interactions: {interactions} (user messages)") +print(f"# Assistant msgs: {len(assistant_msgs)}") +print(f"# Tool calls: {len(tool_results)}") +print(f"Avg interactions/day:{avg_per_day:.1f}") +print(f"\nPer-day breakdown:") +for day in active_days: + print(f" {day}: {day_counts[day]} interactions") + +# ============================================================ +# 2. CATEGORY CLASSIFICATION +# ============================================================ +print("\n" + "=" * 60) +print("2. 
# Keyword rules for the first-pass classifier.
# Each category maps to a list of regexes; a category applies to a message as
# soon as any one of its regexes is found in the lowercased text.
# Dict order matters: it fixes the order of the returned labels, and callers
# treat the first label as the primary category.
CATEGORY_RULES = {
    "communicate": [
        r"\b(send|email|message|text|slack|whatsapp|telegram|call|reply|forward|dm)\b",
        r"\b(tell|notify|contact|reach out|write to)\b",
    ],
    "retrieve": [
        r"\b(search|find|look up|google|what is|who is|where is|how to|check)\b",
        r"\b(weather|news|price|stock|recipe|directions|info|information)\b",
        r"\b(browse|open|go to|navigate|visit)\b",
    ],
    "save": [
        r"\b(save|add to|note|bookmark|remember this|write down|log|record)\b",
        r"\b(shopping list|todo|reminder|calendar|schedule)\b",
        r"\b(add .* to cart|add .* to list)\b",
    ],
    "recall": [
        r"\b(what did|remind me|recall|memory|remember when|last time)\b",
        r"\b(history|previous|earlier|before)\b",
    ],
    "shop": [
        r"\b(buy|purchase|order|amazon|cart|checkout|shop|price|compare)\b",
        r"\b(add .* to .*cart)\b",
        r"\b(ebay|walmart|target|store)\b",
    ],
    "control": [
        r"\b(turn on|turn off|set|adjust|dim|bright|volume|play|pause|stop|skip)\b",
        r"\b(light|thermostat|smart home|device|bluetooth|wifi)\b",
        r"\b(timer|alarm|mute|unmute)\b",
    ],
}


def classify_interaction(text):
    """Return the list of category labels whose keyword rules match ``text``.

    Matching is case-insensitive (the text is lowercased first) and
    multi-label: every category with at least one matching regex is included,
    in ``CATEGORY_RULES`` order. When nothing matches, the result is
    ``["retrieve"]``.
    """
    lowered = text.lower()
    matched = [
        category
        for category, regexes in CATEGORY_RULES.items()
        if any(re.search(regex, lowered) for regex in regexes)
    ]
    # Fallback label when no rule fires.
    return matched or ["retrieve"]
CAMERA-BASED USAGE") +print("=" * 60) + +# NOTE: Camera-based usage is hard to detect from OpenClaw logs alone. +# OpenClaw only sees the text commands delegated from Gemini. +# The camera frames flow Gemini Live (not through OpenClaw). +# We flag interactions where the user's request implies visual/camera context, +# but the real camera usage data would need to come from the iOS app logs +# or Gemini session transcripts. +# +# For the paper: ALL glass sessions involve camera (glasses are always streaming +# ~1fps to Gemini). The question is which interactions were *visually-grounded* +# (user asked about what they see) vs *voice-only* (user just spoke a command). + +CAMERA_KEYWORDS = [ + r"\b(what am i looking at|what do you see|what is this|describe|read this|scan)\b", + r"\b(looking at|in front of|see this|show me|camera|photo|picture|image|visual)\b", + r"\b(label|sign|text on|package|barcode|qr code|screen|display)\b", + r"\b(identify|recognize|detect|object|scene)\b", + r"\b(read|translate|what does .* say)\b", + r"tool_call_image_url", +] + +camera_interactions = [] +for msg in user_msgs: + text_lower = msg["text"].lower() + is_camera = msg.get("has_image", False) or msg.get("has_image_ref", False) + if not is_camera: + for pattern in CAMERA_KEYWORDS: + if re.search(pattern, text_lower): + is_camera = True + break + if is_camera: + camera_interactions.append(msg) + +# Also check: any interaction that includes "image" in the tool input may indicate camera +# Check assistant responses that reference images +for r in records: + if r["role"] == "assistant" and r.get("tool_calls"): + for tc in r["tool_calls"]: + inp = tc.get("input_preview", "").lower() + if "image" in inp or "photo" in inp or "camera" in inp: + # Find the preceding user message in same session + pass # would need more complex tracking + +print(f"Camera/visually-grounded interactions (keyword-detected): {len(camera_interactions)} / {len(user_msgs)} 
def parse_ts(ts_str):
    """Parse an ISO-8601 timestamp string into a ``datetime``.

    Args:
        ts_str: Timestamp text such as ``"2026-02-06T10:00:00Z"``. A trailing
            ``Z`` is accepted and treated as UTC.

    Returns:
        The parsed ``datetime``, or ``None`` when ``ts_str`` is empty/falsy,
        is not a string, or is not a valid ISO-8601 timestamp.
    """
    if not ts_str:
        return None
    try:
        # Spell "Z" as an explicit UTC offset: datetime.fromisoformat() only
        # accepts the "Z" suffix on newer Python versions.
        return datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed timestamp text.
        # TypeError / AttributeError: a non-string value slipped in.
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate,
        # which the previous bare `except:` prevented.
        return None
def latency_stats(values, label):
    """Print summary statistics for a list of latencies in milliseconds.

    Prints a "no data" line and returns when ``values`` is empty. Otherwise
    prints the count, mean, median, P25/P75/P95, min and max under ``label``.
    Nothing is returned.

    Order statistics are read directly from the sorted list at index
    ``int(n * fraction)`` (``n // 2`` for the median), without interpolation.
    NOTE(review): for an even ``n`` this reports the upper of the two middle
    values rather than their average. The paper-ready summary uses the same
    indexing, so change both together if a true median is wanted.
    """
    if not values:
        print(f" {label}: no data")
        return

    values_sorted = sorted(values)
    n = len(values_sorted)

    def pick(fraction):
        # Element at the given fraction of the way through the sorted list.
        return values_sorted[int(n * fraction)]

    mean = sum(values_sorted) / n
    median = values_sorted[n // 2]
    p25, p75, p95 = pick(0.25), pick(0.75), pick(0.95)
    mn, mx = values_sorted[0], values_sorted[-1]

    report = [
        f" {label} (n={n}):",
        f" Mean: {mean:>8.0f} ms ({mean/1000:.1f}s)",
        f" Median: {median:>8.0f} ms ({median/1000:.1f}s)",
        f" P25: {p25:>8.0f} ms",
        f" P75: {p75:>8.0f} ms",
        f" P95: {p95:>8.0f} ms",
        f" Min: {mn:>8.0f} ms",
        f" Max: {mx:>8.0f} ms",
    ]
    for row in report:
        print(row)
FINE-GRAINED STATISTICS") +print("=" * 60) + +# Tool usage breakdown +all_tool_calls = [] +for r in records: + if r["role"] == "assistant" and r.get("tool_calls"): + for tc in r["tool_calls"]: + all_tool_calls.append(tc) + +tool_counts = Counter(tc["name"] for tc in all_tool_calls) +print(f"\nTool usage breakdown (N={len(all_tool_calls)}):") +for tool, count in tool_counts.most_common(): + pct = count / len(all_tool_calls) * 100 + print(f" {tool:20s}: {count:4d} ({pct:5.1f}%)") + +# Avg tool calls per interaction +sessions_with_tools = defaultdict(int) +for r in records: + if r["role"] == "assistant" and r.get("tool_calls"): + sessions_with_tools[r["session_key"]] += len(r["tool_calls"]) + +tool_calls_per_session = list(sessions_with_tools.values()) +if tool_calls_per_session: + avg_tools = sum(tool_calls_per_session) / len(tool_calls_per_session) + print(f"\nAvg tool calls per session: {avg_tools:.1f}") + +# Avg tool calls per user interaction +tools_per_interaction = len(all_tool_calls) / len(user_msgs) if user_msgs else 0 +print(f"Avg tool calls per interaction: {tools_per_interaction:.1f}") + +# Session duration stats +session_durations = [] +for session_key, session_records in by_session.items(): + timestamps = [parse_ts(r["timestamp"]) for r in session_records if r["timestamp"]] + timestamps = [t for t in timestamps if t] + if len(timestamps) >= 2: + duration = (max(timestamps) - min(timestamps)).total_seconds() + session_durations.append(duration) + +if session_durations: + avg_dur = sum(session_durations) / len(session_durations) + med_dur = sorted(session_durations)[len(session_durations) // 2] + print(f"\nSession duration:") + print(f" Avg: {avg_dur:.0f}s ({avg_dur/60:.1f}min)") + print(f" Median: {med_dur:.0f}s ({med_dur/60:.1f}min)") + print(f" Min: {min(session_durations):.0f}s") + print(f" Max: {max(session_durations):.0f}s ({max(session_durations)/60:.1f}min)") + +# Token usage +total_input_tokens = 0 +total_output_tokens = 0 +for r in records: + if 
r.get("usage"): + total_input_tokens += r["usage"].get("input_tokens", 0) + total_output_tokens += r["usage"].get("output_tokens", 0) + +print(f"\nToken usage:") +print(f" Total input tokens: {total_input_tokens:>10,}") +print(f" Total output tokens: {total_output_tokens:>10,}") +print(f" Total tokens: {total_input_tokens + total_output_tokens:>10,}") + +# ============================================================ +# 6. LLM CLASSIFICATION PROMPT +# ============================================================ +print("\n" + "=" * 60) +print("6. LLM CLASSIFICATION PROMPT (for more accurate categorization)") +print("=" * 60) + +# Generate a prompt that can be fed to Gemini/Claude for accurate classification +llm_input = [] +for i, msg in enumerate(user_msgs): + llm_input.append({"id": i, "text": msg["text"], "timestamp": msg["timestamp"]}) + +llm_prompt_path = OUTPUT_DIR / "llm-classify-prompt.json" +with open(llm_prompt_path, "w") as f: + json.dump({ + "instructions": ( + "Classify each user interaction into one or more categories: " + "communicate, retrieve, save, recall, shop, control. " + "Definitions:\n" + "- communicate: sending messages, emails, contacting people\n" + "- retrieve: searching for information, looking things up, browsing\n" + "- save: adding items to lists, saving notes, bookmarking, setting reminders\n" + "- recall: asking about past events, memory, history\n" + "- shop: purchasing, adding to cart, comparing prices\n" + "- control: controlling smart devices, settings, timers, media playback\n" + "Return a JSON array where each element has: id, categories (array of strings), primary (single string)." + ), + "interactions": llm_input + }, f, indent=2) + +print(f"LLM classification input saved to: {llm_prompt_path}") +print("Feed this to Gemini/Claude for accurate per-interaction categorization.") + +# ============================================================ +# 7. 
SUMMARY TABLE (paper-ready) +# ============================================================ +print("\n" + "=" * 60) +print("7. PAPER-READY SUMMARY (P1 - Xiaoan)") +print("=" * 60) + +summary = { + "participant": "P1 (Xiaoan)", + "date_range": f"{date_range_start} to {date_range_end}", + "active_days": len(active_days), + "total_sessions": len(session_keys), + "total_interactions": interactions, + "avg_interactions_per_active_day": round(avg_per_day, 1), + "category_breakdown_primary": {cat: primary_counts.get(cat, 0) for cat in ["communicate", "retrieve", "save", "recall", "shop", "control"]}, + "category_breakdown_pct": {cat: round(primary_counts.get(cat, 0) / total * 100, 1) if total else 0 for cat in ["communicate", "retrieve", "save", "recall", "shop", "control"]}, + "camera_based_interactions": len(camera_interactions), + "camera_based_pct": round(len(camera_interactions) / len(user_msgs) * 100, 1) if user_msgs else 0, + "tool_calls_total": len(all_tool_calls), + "tool_breakdown": dict(tool_counts), + "latency_browser_median_ms": round(sorted(browser_latencies)[len(browser_latencies)//2]) if browser_latencies else None, + "latency_browser_mean_ms": round(sum(browser_latencies)/len(browser_latencies)) if browser_latencies else None, + "latency_non_browser_median_ms": round(sorted(non_browser_latencies)[len(non_browser_latencies)//2]) if non_browser_latencies else None, + "latency_non_browser_mean_ms": round(sum(non_browser_latencies)/len(non_browser_latencies)) if non_browser_latencies else None, + "avg_session_duration_sec": round(avg_dur) if session_durations else None, +} + +summary_path = OUTPUT_DIR / "p1-xiaoan-summary.json" +with open(summary_path, "w") as f: + json.dump(summary, f, indent=2) + +print(json.dumps(summary, indent=2)) +print(f"\nSummary saved to: {summary_path}") + +# Save all latency data +latency_path = OUTPUT_DIR / "glass-sessions-latencies.jsonl" +with open(latency_path, "w") as f: + for l in latencies: + f.write(json.dumps(l) + "\n") 
def classify(text, idx):
    """Classify one user message with keyword heuristics.

    Args:
        text: Raw user message text. May carry a chat-context wrapper
            ("[Chat messages since ...] ... [Current message ...] User: ..."),
            in which case only the current user message is classified.
        idx: Position of the message in the input. Unused; kept so existing
            callers keep working.

    Returns:
        A tuple ``(categories, is_camera, is_system)``:
          * ``categories``: list of labels, in the fixed order shop,
            communicate, save, recall, control, retrieve; ``["system"]`` for
            setup/debug chatter; never empty.
          * ``is_camera``: True when the message refers to what the user sees.
          * ``is_system``: True when the message is setup/debug chatter that
            should be excluded from the paper statistics.
    """
    import re  # local import: this module does not import `re` at file level

    t = text.lower().strip()

    # Strip chat context prefixes, keeping only the current user message.
    if "[chat messages since" in t:
        if "[current message" in t:
            t = t.split("[current message")[1]
        if "user:" in t:
            t = t.split("user:", 1)[-1].strip()

    # System/setup messages (not real user interactions).
    # Multi-word phrases are specific enough for plain substring matching.
    system_phrases = [
        "a new session was started",
        "are you able to use the browser",
        "but why is it on a new browser",
        "so how to relay that",
        "you are in 18789",
        "ok installed", "ok do it",
        "it only can act very fast then disconnected",
        "who are u",
        "what did you just do",
        "previous request was cut off",
        "message_id:",
    ]
    if any(phrase in t for phrase in system_phrases):
        return ["system"], False, True

    # Short single-word markers must match as whole words. As substrings they
    # also hit real requests ("cooler", "abandoned") and silently drop them
    # from the statistics.
    if re.search(r"(?<!\w)(?:done|cool)(?!\w)", t):
        return ["system"], False, True

    # Very short non-meaningful messages.
    if t.strip() in ["hi", "c", "done", "ok", "ok installed"]:
        return ["system"], False, True

    # Camera/visually-grounded
    camera_phrases = [
        "in front of", "looking at", "what am i", "what do you see",
        "currently displayed", "the visible", "my eye", "yogurt",
    ]
    is_camera = any(cp in t for cp in camera_phrases)

    # Chinese text about searching for yogurt in front of them.
    # Checked on the original text, not the stripped/lowercased copy.
    if "眼前" in text or "看" in text:
        is_camera = True

    # Keyword lists per category; the order here is the order of the output.
    keyword_table = [
        # Shop: Amazon cart, purchase, buy
        ("shop", ["amazon", "cart", "add to cart", "shopping cart", "purchase", "buy"]),
        # Communicate: email, message, send, notify
        ("communicate", ["send", "email", "message to", "notify", "tell", "slack", "text to"]),
        # Save: shopping list, log, tracker, note, bookmark, remember
        ("save", ["shopping list", "to-do", "todo", "log this", "tracker", "note",
                  "bookmark", "flagged", "flag", "walkthrough issue", "project tracker"]),
        # Recall: what did, remember, history, previous, last time
        ("recall", ["what did", "remember", "recall", "history", "last time", "previous"]),
        # Control: turn on/off, set, adjust, play, pause, volume, timer
        ("control", ["turn on", "turn off", "set ", "adjust", "play ", "pause", "volume",
                     "timer", "alarm", "light", "thermostat"]),
        # Retrieve: search, find, look up, open url, navigate, browse, directions
        ("retrieve", ["search", "find", "look up", "open", "navigate", "browse",
                      "go to", "directions", "arxiv", "paper", "research", "pdf",
                      "check", "view", "click", "select"]),
    ]
    cats = [cat for cat, keywords in keyword_table if any(k in t for k in keywords)]

    # Default when nothing matched: a bare product name counts as implicit
    # shopping, anything else as retrieval. ("amazon"/"cart" cannot reach this
    # point because they are already shop keywords above.)
    if not cats:
        implicit_shop = ["diet coke", "monster", "gatorade", "wowflash", "ray-ban", "unreal"]
        if any(name in t for name in implicit_shop):
            cats.append("shop")
        else:
            cats.append("retrieve")

    return cats, is_camera, False
+print(f"CATEGORY BREAKDOWN (N={total})") +print(f"{'='*60}") +for cat in ["communicate", "retrieve", "save", "recall", "shop", "control"]: + count = primary_counts.get(cat, 0) + pct = count / total * 100 if total else 0 + print(f" {cat:15s}: {count:4d} ({pct:5.1f}%)") + +# Multi-label +multi_counts = Counter() +for r in real_interactions: + for cat in r["categories"]: + if cat != "system": + multi_counts[cat] += 1 + +print(f"\nMulti-label breakdown:") +for cat in ["communicate", "retrieve", "save", "recall", "shop", "control"]: + count = multi_counts.get(cat, 0) + pct = count / total * 100 if total else 0 + print(f" {cat:15s}: {count:4d} ({pct:5.1f}%)") + +# Camera-based +camera_interactions = [r for r in real_interactions if r["is_camera"]] +print(f"\nCamera/visually-grounded: {len(camera_interactions)} / {total} ({len(camera_interactions)/total*100:.1f}%)") +for ci in camera_interactions: + print(f" [{ci['timestamp'][:10]}] {ci['text'][:120]}") + +# Per-day stats (excluding system) +day_counts = Counter(r["timestamp"][:10] for r in real_interactions) +active_days = sorted(day_counts.keys()) + +print(f"\n{'='*60}") +print(f"PER-DAY BREAKDOWN (excluding system messages)") +print(f"{'='*60}") +for day in active_days: + print(f" {day}: {day_counts[day]} interactions") + +print(f"\nActive days: {len(active_days)}") +print(f"Avg interactions/active day: {total/len(active_days):.1f}") + +# Per-day category breakdown +print(f"\n{'='*60}") +print(f"PER-DAY CATEGORY BREAKDOWN") +print(f"{'='*60}") +from collections import defaultdict as _dd +day_cats = _dd(lambda: Counter()) +for r in real_interactions: + day = r["timestamp"][:10] + day_cats[day][r["primary"]] += 1 +for day in active_days: + cats = day_cats[day] + parts = [f"{cat}={cats.get(cat,0)}" for cat in ["communicate", "retrieve", "save", "recall", "shop", "control"] if cats.get(cat, 0) > 0] + print(f" {day}: {', '.join(parts)}") + +# Save results +out_path = OUTPUT_DIR / "glass-sessions-llm-classifications.jsonl" 
+with open(out_path, "w") as f: + for r in results: + f.write(json.dumps(r) + "\n") + +# Save paper-ready summary +summary = { + "participant": "P1 (Xiaoan)", + "total_raw_messages": len(records), + "system_excluded": len(system_msgs), + "total_interactions": total, + "active_days": len(active_days), + "avg_interactions_per_day": round(total / len(active_days), 1), + "category_primary": {cat: primary_counts.get(cat, 0) for cat in ["communicate", "retrieve", "save", "recall", "shop", "control"]}, + "category_primary_pct": {cat: round(primary_counts.get(cat, 0) / total * 100, 1) if total else 0 for cat in ["communicate", "retrieve", "save", "recall", "shop", "control"]}, + "category_multi_label": {cat: multi_counts.get(cat, 0) for cat in ["communicate", "retrieve", "save", "recall", "shop", "control"]}, + "camera_based": len(camera_interactions), + "camera_based_pct": round(len(camera_interactions) / total * 100, 1), +} + +summary_path = OUTPUT_DIR / "p1-xiaoan-classifications-summary.json" +with open(summary_path, "w") as f: + json.dump(summary, f, indent=2) + +print(f"\nClassifications saved to: {out_path}") +print(f"Summary saved to: {summary_path}") diff --git a/samples/CameraAccess/scripts/extract_glass_sessions.py b/samples/CameraAccess/scripts/extract_glass_sessions.py new file mode 100644 index 00000000..7732d3c0 --- /dev/null +++ b/samples/CameraAccess/scripts/extract_glass_sessions.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +""" +extract_glass_sessions.py — Extract all VisionClaw glass session logs from OpenClaw. + +Produces: + 1. glass-sessions-raw.jsonl — All raw session data merged + 2. glass-sessions-structured.jsonl — Clean structured messages (timestamp, role, text, tools, etc.) 
+ +Data source: ~/.openclaw/agents/main/sessions/sessions.json +""" + +import json +import os +import sys +from pathlib import Path +from datetime import datetime + +SESSIONS_DIR = Path.home() / ".openclaw" / "agents" / "main" / "sessions" +SESSIONS_JSON = SESSIONS_DIR / "sessions.json" +OUTPUT_DIR = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("/tmp/visionclaw-data") + +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +if not SESSIONS_JSON.exists(): + print(f"Error: {SESSIONS_JSON} not found", file=sys.stderr) + sys.exit(1) + +with open(SESSIONS_JSON) as f: + store = json.load(f) + +# Find all glass session keys +glass_sessions = {k: v for k, v in store.items() if "glass" in k.lower()} +print(f"Found {len(glass_sessions)} glass sessions") + +# Resolve session files: try sessionFile first, then sessionId.jsonl +session_files = [] +for key, entry in sorted(glass_sessions.items()): + sf = entry.get("sessionFile") + sid = entry.get("sessionId", "") + + if sf and Path(sf).exists(): + session_files.append((key, Path(sf))) + elif sid: + candidate = SESSIONS_DIR / f"{sid}.jsonl" + if candidate.exists(): + session_files.append((key, candidate)) + +print(f"Found {len(session_files)} session files with data") + +# --- Extract raw --- +raw_path = OUTPUT_DIR / "glass-sessions-raw.jsonl" +structured_path = OUTPUT_DIR / "glass-sessions-structured.jsonl" + +raw_lines = 0 +structured_records = [] + +with open(raw_path, "w") as raw_out: + for session_key, fpath in session_files: + with open(fpath) as f: + for line in f: + line = line.strip() + if not line: + continue + raw_out.write(line + "\n") + raw_lines += 1 + + try: + obj = json.loads(line) + except json.JSONDecodeError: + continue + + if obj.get("type") != "message": + continue + + msg = obj.get("message", {}) + role = msg.get("role", "") + timestamp = obj.get("timestamp", "") + content = msg.get("content", []) + + # Extract text + texts = [] + tool_calls = [] + tool_results = [] + has_thinking = False + has_image = False 
+ + for c in content: + ct = c.get("type", "") + if ct == "text": + texts.append(c.get("text", "")) + elif ct == "toolCall": + tool_calls.append({ + "id": c.get("id", ""), + "name": c.get("name", ""), + "input_preview": json.dumps(c.get("input", {}))[:300] + }) + elif ct == "toolResult": + result_text = "" + for rc in c.get("content", []): + if rc.get("type") == "text": + result_text = rc.get("text", "")[:500] + tool_results.append({ + "id": c.get("id", ""), + "name": c.get("name", ""), + "result_preview": result_text + }) + elif ct == "thinking": + has_thinking = True + elif ct == "image": + has_image = True + + # Check for image URLs in text + full_text = "\n".join(texts) + has_image_url = "tool_call_image_url" in full_text or "image" in full_text.lower() + + usage = msg.get("usage", {}) + + record = { + "session_key": session_key, + "timestamp": timestamp, + "role": role, + "text": full_text, + "tool_calls": tool_calls if tool_calls else None, + "tool_results": tool_results if tool_results else None, + "has_thinking": has_thinking, + "has_image": has_image, + "has_image_ref": has_image_url, + "usage": { + "input_tokens": usage.get("inputTokens", 0), + "output_tokens": usage.get("outputTokens", 0), + } if usage else None + } + structured_records.append(record) + +# Sort by timestamp +structured_records.sort(key=lambda r: r["timestamp"]) + +with open(structured_path, "w") as f: + for r in structured_records: + f.write(json.dumps(r) + "\n") + +# --- Stats summary --- +user_msgs = [r for r in structured_records if r["role"] == "user"] +assistant_msgs = [r for r in structured_records if r["role"] == "assistant"] +tool_result_msgs = [r for r in structured_records if r["role"] == "toolResult"] + +dates = set() +for r in structured_records: + if r["timestamp"]: + dates.add(r["timestamp"][:10]) + +first_ts = structured_records[0]["timestamp"] if structured_records else "?" +last_ts = structured_records[-1]["timestamp"] if structured_records else "?" 
+ +print(f"\n=== Extraction Complete ===") +print(f"Raw: {raw_path} ({raw_lines} lines)") +print(f"Structured: {structured_path} ({len(structured_records)} records)") +print(f"Date range: {first_ts} -> {last_ts}") +print(f"Unique dates: {len(dates)}") +print(f"Messages: user={len(user_msgs)} assistant={len(assistant_msgs)} toolResult={len(tool_result_msgs)}") +print(f"Sessions: {len(session_files)}") + +# Per-day breakdown +from collections import Counter +day_counts = Counter() +for r in user_msgs: + if r["timestamp"]: + day_counts[r["timestamp"][:10]] += 1 + +print(f"\nPer-day user message breakdown:") +for day, count in sorted(day_counts.items()): + print(f" {day}: {count} user messages") diff --git a/samples/CameraAccess/scripts/extract_voice_logs.py b/samples/CameraAccess/scripts/extract_voice_logs.py new file mode 100644 index 00000000..4d05cbd6 --- /dev/null +++ b/samples/CameraAccess/scripts/extract_voice_logs.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +""" +extract_voice_logs.py — Extract VisionClaw voice interaction logs. + +These are the Gemini-side logs (voice transcripts, session lifecycle) +captured by RemoteLogger on iOS/Android, stored by the Node.js server. + +Data source: ~/.openclaw/visionclaw-logs/visionclaw-YYYY-MM-DD.jsonl + +This script complements extract_glass_sessions.py which extracts OpenClaw +tool-call logs. Together they give the full picture: + - Voice logs: ALL interactions (voice:user, voice:ai, session lifecycle) + - OpenClaw logs: Only tool-use interactions (browser, web_search, etc.) 
+ +Usage: python3 extract_voice_logs.py [output-dir] +""" + +import json +import os +import sys +from pathlib import Path +from collections import Counter +from datetime import datetime + +VOICE_LOGS_DIR = Path.home() / ".openclaw" / "visionclaw-logs" +OUTPUT_DIR = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("/tmp/visionclaw-data") +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +if not VOICE_LOGS_DIR.exists(): + print(f"No voice logs found at {VOICE_LOGS_DIR}") + print("Make sure the VisionClaw signaling server (node index.js) is running") + print("and that RemoteLogger is enabled in the iOS/Android app.") + sys.exit(1) + +# Find all log files +log_files = sorted(VOICE_LOGS_DIR.glob("visionclaw-*.jsonl")) +print(f"Found {len(log_files)} daily log files in {VOICE_LOGS_DIR}") + +# Parse all entries +all_entries = [] +for f in log_files: + date = f.stem.replace("visionclaw-", "") + with open(f) as fh: + for line in fh: + line = line.strip() + if not line: + continue + try: + entry = json.loads(line) + entry["_date"] = date + all_entries.append(entry) + except json.JSONDecodeError: + continue + +print(f"Total log entries: {len(all_entries)}") + +# Classify by event type +event_counts = Counter() +for e in all_entries: + data = e.get("data", {}) + event = data.get("event", e.get("type", "unknown")) + event_counts[event] += 1 + +print(f"\nEvent type breakdown:") +for event, count in event_counts.most_common(): + print(f" {event:20s}: {count}") + +# Extract voice interactions +voice_user = [e for e in all_entries if e.get("data", {}).get("event") == "voice:user"] +voice_ai = [e for e in all_entries if e.get("data", {}).get("event") == "voice:ai"] +tool_calls = [e for e in all_entries if e.get("data", {}).get("event") == "voice:tool_call"] +tool_results = [e for e in all_entries if e.get("data", {}).get("event") == "voice:tool_result"] +session_starts = [e for e in all_entries if e.get("data", {}).get("event") == "session:start"] +session_ends = [e for e in 
all_entries if e.get("data", {}).get("event") == "session:end"] + +print(f"\nVoice interactions:") +print(f" User utterances: {len(voice_user)}") +print(f" AI responses: {len(voice_ai)}") +print(f" Tool calls: {len(tool_calls)}") +print(f" Tool results: {len(tool_results)}") +print(f" Session starts: {len(session_starts)}") +print(f" Session ends: {len(session_ends)}") + +# Platform breakdown +ios_entries = [e for e in all_entries if e.get("session") == "ios-client"] +android_entries = [e for e in all_entries if e.get("session") == "android-client"] +print(f"\nPlatform breakdown:") +print(f" iOS: {len(ios_entries)}") +print(f" Android: {len(android_entries)}") + +# Per-day breakdown +day_counts = Counter(e["_date"] for e in voice_user) +active_days = sorted(day_counts.keys()) +print(f"\nActive days: {len(active_days)}") +for day in active_days: + print(f" {day}: {day_counts[day]} user utterances") + +# Save structured output +output_path = OUTPUT_DIR / "voice-logs-all.jsonl" +with open(output_path, "w") as f: + for e in all_entries: + f.write(json.dumps(e) + "\n") + +# Save voice interactions only +voice_path = OUTPUT_DIR / "voice-interactions.jsonl" +with open(voice_path, "w") as f: + for e in voice_user + voice_ai: + f.write(json.dumps(e) + "\n") + +print(f"\nOutput:") +print(f" All logs: {output_path}") +print(f" Voice interactions: {voice_path}") + +# Print sample user utterances +if voice_user: + print(f"\nSample user utterances:") + for e in voice_user[:10]: + text = e.get("data", {}).get("text", "")[:100] + ts = e.get("ts", "")[:19] + platform = e.get("session", "?") + print(f" [{ts}] ({platform}) {text}") diff --git a/samples/CameraAccess/scripts/upload_server.py b/samples/CameraAccess/scripts/upload_server.py new file mode 100644 index 00000000..d07bdd24 --- /dev/null +++ b/samples/CameraAccess/scripts/upload_server.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +""" +Tiny image upload server for VisionClaw. 
+Accepts JPEG uploads and saves them to ~/.openclaw/media/visionclaw/. +Returns the file path so the agent can read/copy/upload the file. + +Usage: python3 upload_server.py [port] +Default port: 18792 +""" + +import os +import sys +import json +import time +from http.server import HTTPServer, BaseHTTPRequestHandler +from pathlib import Path + +SAVE_DIR = Path.home() / ".openclaw" / "media" / "visionclaw" +PORT = int(sys.argv[1]) if len(sys.argv) > 1 else 18792 + +SAVE_DIR.mkdir(parents=True, exist_ok=True) + +class UploadHandler(BaseHTTPRequestHandler): + def do_POST(self): + if self.path != "/upload": + self.send_response(404) + self.end_headers() + return + + content_length = int(self.headers.get("Content-Length", 0)) + if content_length == 0 or content_length > 10 * 1024 * 1024: # 10MB max + self.send_response(400) + self.end_headers() + self.wfile.write(b'{"error":"invalid size"}') + return + + body = self.rfile.read(content_length) + filename = f"frame-{int(time.time() * 1000)}.jpg" + filepath = SAVE_DIR / filename + filepath.write_bytes(body) + + response = json.dumps({"path": str(filepath), "size": len(body)}) + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + self.wfile.write(response.encode()) + print(f"Saved: {filepath} ({len(body)} bytes)") + + def do_GET(self): + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + self.wfile.write(b'{"status":"ok","service":"visionclaw-upload"}') + + def log_message(self, format, *args): + pass # suppress default logs + +if __name__ == "__main__": + server = HTTPServer(("0.0.0.0", PORT), UploadHandler) + print(f"VisionClaw upload server listening on port {PORT}") + print(f"Saving to: {SAVE_DIR}") + try: + server.serve_forever() + except KeyboardInterrupt: + print("\nStopped") diff --git a/samples/CameraAccess/server/index.js b/samples/CameraAccess/server/index.js index dbb9149d..b2d98753 100644 --- 
a/samples/CameraAccess/server/index.js +++ b/samples/CameraAccess/server/index.js @@ -34,8 +34,91 @@ function getTurnCredentials() { }; } +// --- Session Logger --- +// Stores logs as JSONL files in ~/.openclaw/visionclaw-logs/ so OpenClaw +// can discover and analyze them via its file tools (read, exec, etc.). +// This makes logs "agent-extractable" — just ask OpenClaw to read them. +const os = require("os"); +const LOGS_DIR = path.join(os.homedir(), ".openclaw", "visionclaw-logs"); +if (!fs.existsSync(LOGS_DIR)) fs.mkdirSync(LOGS_DIR, { recursive: true }); +console.log(`[Logger] Writing logs to ${LOGS_DIR}`); + +function getLogFilePath() { + const date = new Date().toISOString().slice(0, 10); // YYYY-MM-DD + return path.join(LOGS_DIR, `visionclaw-${date}.jsonl`); +} + +function appendLog(entry) { + const line = JSON.stringify(entry) + "\n"; + fs.appendFile(getLogFilePath(), line, (err) => { + if (err) console.error("[Logger] Write error:", err.message); + }); +} + // HTTP server for serving the web viewer const httpServer = http.createServer((req, res) => { + // --- Logging API --- + if (req.url === "/api/logs" && req.method === "POST") { + let body = ""; + req.on("data", (chunk) => (body += chunk)); + req.on("end", () => { + try { + const payload = JSON.parse(body); + const entry = { + ts: new Date().toISOString(), + type: payload.type || "event", + session: payload.session || "unknown", + data: payload.data || payload, + }; + appendLog(entry); + res.writeHead(200, { + "Content-Type": "application/json", + "Access-Control-Allow-Origin": "*", + }); + res.end(JSON.stringify({ ok: true })); + } catch (e) { + res.writeHead(400, { "Content-Type": "application/json" }); + res.end(JSON.stringify({ error: "Invalid JSON" })); + } + }); + return; + } + + if (req.url?.startsWith("/api/logs") && req.method === "GET") { + // Return recent logs from today's file + const logFile = getLogFilePath(); + if (!fs.existsSync(logFile)) { + res.writeHead(200, { + "Content-Type": 
"application/json", + "Access-Control-Allow-Origin": "*", + }); + res.end(JSON.stringify({ logs: [], count: 0 })); + return; + } + const lines = fs.readFileSync(logFile, "utf-8").trim().split("\n").filter(Boolean); + const count = parseInt(new URL(req.url, "http://localhost").searchParams.get("count") || "50"); + const logs = lines.slice(-count).reverse().map((l) => { + try { return JSON.parse(l); } catch { return null; } + }).filter(Boolean); + res.writeHead(200, { + "Content-Type": "application/json", + "Access-Control-Allow-Origin": "*", + }); + res.end(JSON.stringify({ logs, count: logs.length })); + return; + } + + // CORS preflight + if (req.method === "OPTIONS") { + res.writeHead(204, { + "Access-Control-Allow-Origin": "*", + "Access-Control-Allow-Methods": "GET, POST, OPTIONS", + "Access-Control-Allow-Headers": "Content-Type, x-api-token", + }); + res.end(); + return; + } + // TURN credentials API endpoint if (req.url === "/api/turn") { const creds = getTurnCredentials(); diff --git a/samples/CameraAccessAndroid/app/src/main/AndroidManifest.xml b/samples/CameraAccessAndroid/app/src/main/AndroidManifest.xml index 5ed342a1..10a7c062 100644 --- a/samples/CameraAccessAndroid/app/src/main/AndroidManifest.xml +++ b/samples/CameraAccessAndroid/app/src/main/AndroidManifest.xml @@ -12,7 +12,10 @@ + + + + android:foregroundServiceType="connectedDevice|microphone" /> diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/MainActivity.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/MainActivity.kt index 7d3f20b7..ed17fffd 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/MainActivity.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/MainActivity.kt @@ -82,7 +82,6 @@ class MainActivity : ComponentActivity() { setContent { 
CameraAccessScaffold( viewModel = viewModel, - onRequestWearablesPermission = ::requestWearablesPermission, ) } } diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/chat/ChatHistoryStore.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/chat/ChatHistoryStore.kt new file mode 100644 index 00000000..40a37e5a --- /dev/null +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/chat/ChatHistoryStore.kt @@ -0,0 +1,87 @@ +package com.meta.wearable.dat.externalsampleapps.cameraaccess.chat + +import android.content.Context +import android.util.Log +import org.json.JSONArray +import org.json.JSONObject +import java.io.File + +object ChatHistoryStore { + private const val TAG = "ChatHistoryStore" + private const val FILENAME = "chat_history.json" + private const val MAX_MESSAGES = 500 + + fun save(context: Context, messages: List) { + try { + val json = JSONArray() + for (msg in messages.takeLast(MAX_MESSAGES)) { + json.put(JSONObject().apply { + put("id", msg.id) + put("role", serializeRole(msg.role)) + put("text", msg.text) + put("timestamp", msg.timestamp) + put("status", serializeStatus(msg.status)) + }) + } + File(context.filesDir, FILENAME).writeText(json.toString()) + } catch (e: Exception) { + Log.e(TAG, "Failed to save: ${e.message}") + } + } + + fun load(context: Context): List { + val file = File(context.filesDir, FILENAME) + if (!file.exists()) return emptyList() + return try { + val json = JSONArray(file.readText()) + val messages = mutableListOf() + for (i in 0 until json.length()) { + val obj = json.getJSONObject(i) + val rawStatus = obj.optString("status", "complete") + val text = obj.optString("text", "") + // Fix stale "Executing..." 
messages from interrupted sessions + val fixedText = if (rawStatus == "streaming" && text == "Executing...") "Cancelled" else text + messages.add(ChatMessage( + id = obj.getString("id"), + role = deserializeRole(obj.getString("role")), + text = fixedText, + timestamp = obj.getLong("timestamp"), + status = deserializeStatus(rawStatus), + )) + } + Log.d(TAG, "Loaded ${messages.size} messages") + messages + } catch (e: Exception) { + Log.e(TAG, "Failed to load: ${e.message}") + emptyList() + } + } + + private fun serializeRole(role: ChatMessageRole): String = when (role) { + is ChatMessageRole.User -> "user" + is ChatMessageRole.Assistant -> "assistant" + is ChatMessageRole.ToolCall -> "tool:${role.name}" + is ChatMessageRole.SessionDivider -> "divider" + } + + private fun deserializeRole(s: String): ChatMessageRole = when { + s == "user" -> ChatMessageRole.User + s == "assistant" -> ChatMessageRole.Assistant + s == "divider" -> ChatMessageRole.SessionDivider + s.startsWith("tool:") -> ChatMessageRole.ToolCall(s.removePrefix("tool:")) + else -> ChatMessageRole.Assistant + } + + private fun serializeStatus(status: ChatMessageStatus): String = when (status) { + is ChatMessageStatus.Streaming -> "streaming" + is ChatMessageStatus.Complete -> "complete" + is ChatMessageStatus.Error -> "error:${status.message}" + } + + private fun deserializeStatus(s: String): ChatMessageStatus = when { + s == "complete" -> ChatMessageStatus.Complete + s == "streaming" -> ChatMessageStatus.Complete // treat stale streaming as complete + s.startsWith("error:") -> ChatMessageStatus.Error(s.removePrefix("error:")) + else -> ChatMessageStatus.Complete + } +} diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/chat/ChatMessage.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/chat/ChatMessage.kt new file mode 100644 index 00000000..c5a9e5f4 --- /dev/null +++ 
b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/chat/ChatMessage.kt @@ -0,0 +1,24 @@ +package com.meta.wearable.dat.externalsampleapps.cameraaccess.chat + +import java.util.UUID + +data class ChatMessage( + val id: String = UUID.randomUUID().toString(), + val role: ChatMessageRole, + var text: String, + val timestamp: Long = System.currentTimeMillis(), + var status: ChatMessageStatus = ChatMessageStatus.Complete, +) + +sealed class ChatMessageRole { + data object User : ChatMessageRole() + data object Assistant : ChatMessageRole() + data class ToolCall(val name: String) : ChatMessageRole() + data object SessionDivider : ChatMessageRole() +} + +sealed class ChatMessageStatus { + data object Streaming : ChatMessageStatus() + data object Complete : ChatMessageStatus() + data class Error(val message: String) : ChatMessageStatus() +} diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gallery/CapturedPhoto.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gallery/CapturedPhoto.kt new file mode 100644 index 00000000..4fb0c0a1 --- /dev/null +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gallery/CapturedPhoto.kt @@ -0,0 +1,8 @@ +package com.meta.wearable.dat.externalsampleapps.cameraaccess.gallery + +data class CapturedPhoto( + val id: String, + val filename: String, + val timestamp: Long, + val description: String? 
+) diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gallery/PhotoCaptureStore.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gallery/PhotoCaptureStore.kt new file mode 100644 index 00000000..16559439 --- /dev/null +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gallery/PhotoCaptureStore.kt @@ -0,0 +1,120 @@ +package com.meta.wearable.dat.externalsampleapps.cameraaccess.gallery + +import android.content.Context +import android.graphics.Bitmap +import android.graphics.BitmapFactory +import android.util.Log +import kotlinx.coroutines.flow.MutableStateFlow +import kotlinx.coroutines.flow.StateFlow +import kotlinx.coroutines.flow.asStateFlow +import org.json.JSONArray +import org.json.JSONObject +import java.io.File +import java.io.FileOutputStream +import java.text.SimpleDateFormat +import java.util.Date +import java.util.Locale +import java.util.UUID + +object PhotoCaptureStore { + private const val TAG = "PhotoCaptureStore" + private const val MANIFEST_FILE = "manifest.json" + + private val _photos = MutableStateFlow>(emptyList()) + val photos: StateFlow> = _photos.asStateFlow() + + private fun capturesDir(context: Context): File { + val dir = File(context.filesDir, "captures") + if (!dir.exists()) dir.mkdirs() + return dir + } + + fun loadPhotos(context: Context) { + val manifestFile = File(capturesDir(context), MANIFEST_FILE) + if (!manifestFile.exists()) { + _photos.value = emptyList() + return + } + try { + val json = JSONArray(manifestFile.readText()) + val loaded = mutableListOf() + for (i in 0 until json.length()) { + val obj = json.getJSONObject(i) + val photo = CapturedPhoto( + id = obj.getString("id"), + filename = obj.getString("filename"), + timestamp = obj.getLong("timestamp"), + description = obj.optString("description", null) + ) + if (File(capturesDir(context), 
photo.filename).exists()) { + loaded.add(photo) + } + } + _photos.value = loaded + Log.d(TAG, "Loaded ${loaded.size} photos from manifest") + } catch (e: Exception) { + Log.e(TAG, "Failed to load manifest: ${e.message}") + _photos.value = emptyList() + } + } + + fun saveFrame(context: Context, bitmap: Bitmap, description: String?): CapturedPhoto? { + val formatter = SimpleDateFormat("yyyy-MM-dd_HH-mm-ss", Locale.US) + val filename = "capture_${formatter.format(Date())}.jpg" + val file = File(capturesDir(context), filename) + + return try { + FileOutputStream(file).use { out -> + bitmap.compress(Bitmap.CompressFormat.JPEG, 90, out) + } + val photo = CapturedPhoto( + id = UUID.randomUUID().toString(), + filename = filename, + timestamp = System.currentTimeMillis(), + description = description + ) + val current = _photos.value.toMutableList() + current.add(0, photo) + _photos.value = current + saveManifest(context) + Log.d(TAG, "Saved: $filename (${file.length()} bytes)") + photo + } catch (e: Exception) { + Log.e(TAG, "Failed to save photo: ${e.message}") + null + } + } + + fun deletePhoto(context: Context, photo: CapturedPhoto) { + File(capturesDir(context), photo.filename).delete() + _photos.value = _photos.value.filter { it.id != photo.id } + saveManifest(context) + Log.d(TAG, "Deleted: ${photo.filename}") + } + + fun getPhotoFile(context: Context, photo: CapturedPhoto): File { + return File(capturesDir(context), photo.filename) + } + + fun loadBitmap(context: Context, photo: CapturedPhoto): Bitmap? 
{ + val file = getPhotoFile(context, photo) + return if (file.exists()) BitmapFactory.decodeFile(file.absolutePath) else null + } + + private fun saveManifest(context: Context) { + try { + val json = JSONArray() + for (photo in _photos.value) { + json.put(JSONObject().apply { + put("id", photo.id) + put("filename", photo.filename) + put("timestamp", photo.timestamp) + if (photo.description != null) put("description", photo.description) + }) + } + File(capturesDir(context), MANIFEST_FILE).writeText(json.toString(2)) + } catch (e: Exception) { + Log.e(TAG, "Failed to save manifest: ${e.message}") + } + } +} diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/AudioManager.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/AudioManager.kt index fb6268ee..b97e94f4 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/AudioManager.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/AudioManager.kt @@ -1,15 +1,24 @@ package com.meta.wearable.dat.externalsampleapps.cameraaccess.gemini import android.annotation.SuppressLint +import android.content.Context import android.media.AudioAttributes +import android.media.AudioDeviceInfo import android.media.AudioFormat import android.media.AudioRecord import android.media.AudioTrack import android.media.MediaRecorder +import android.media.audiofx.AcousticEchoCanceler +import android.media.audiofx.AutomaticGainControl +import android.media.audiofx.NoiseSuppressor +import android.os.Build import android.util.Log +import com.meta.wearable.dat.externalsampleapps.cameraaccess.settings.SettingsManager import java.io.ByteArrayOutputStream +import java.util.concurrent.Executors +import java.util.concurrent.RejectedExecutionException -class AudioManager { +class AudioManager(private val 
appContext: Context) { companion object { private const val TAG = "AudioManager" private const val MIN_SEND_BYTES = 3200 // 100ms at 16kHz mono Int16 = 1600 frames * 2 bytes @@ -19,23 +28,340 @@ class AudioManager { private var audioRecord: AudioRecord? = null private var audioTrack: AudioTrack? = null + private var echoCanceler: AcousticEchoCanceler? = null + private var noiseSuppressor: NoiseSuppressor? = null + private var automaticGainControl: AutomaticGainControl? = null private var captureThread: Thread? = null + private val playbackExecutor = Executors.newSingleThreadExecutor { runnable -> + Thread(runnable, "audio-playback").apply { isDaemon = true } + } + private val playbackLock = Any() + @Volatile private var isCapturing = false + + @Volatile + private var playbackGeneration = 0 + + @Volatile + private var micEnabled = true + private val accumulatedData = ByteArrayOutputStream() private val accumulateLock = Any() + private var commDeviceSet = false + private var scoStarted = false + private var preferredBtDevice: AudioDeviceInfo? = null + private var preferredBtInputDevice: AudioDeviceInfo? = null + private var preferredBtOutputDevice: AudioDeviceInfo? = null + private var lastInputLevelLogMs = 0L + private var lastPlaybackLevelLogMs = 0L + private var silentInputLevelLogs = 0 + private var fellBackToBuiltInMic = false + + private data class BluetoothAudioRoute( + val communicationDevice: AudioDeviceInfo?, + val inputDevice: AudioDeviceInfo?, + val outputDevice: AudioDeviceInfo?, + ) + + /** + * "Mic mute" without tearing down the whole Gemini session. + * + * - enabled=false: we still keep AudioRecord running (so routing stays stable), + * but we DO NOT forward audio chunks to Gemini. + * - when toggling, we clear any buffered audio to avoid "catch-up" sending. 
+ */ + fun setMicEnabled(enabled: Boolean) { + micEnabled = enabled + synchronized(accumulateLock) { + accumulatedData.reset() + } + Log.d(TAG, "Mic enabled = $micEnabled") + } + + fun isMicEnabled(): Boolean = micEnabled + @SuppressLint("MissingPermission") fun startCapture() { if (isCapturing) return + val sysAm = appContext.getSystemService(Context.AUDIO_SERVICE) as android.media.AudioManager + val demoSpeakerMode = SettingsManager.demoSpeakerModeEnabled + var useBluetoothMediaOutputOnly = false + + if (demoSpeakerMode) { + sysAm.mode = android.media.AudioManager.MODE_IN_COMMUNICATION + commDeviceSet = false + scoStarted = false + preferredBtDevice = null + preferredBtInputDevice = null + preferredBtOutputDevice = null + Log.d(TAG, "Demo speaker mode enabled -> use phone-style communication input without BT SCO") + } else { + val bluetoothRoute = findBluetoothAudioRoute(sysAm) + preferredBtDevice = bluetoothRoute.communicationDevice + preferredBtInputDevice = bluetoothRoute.inputDevice + preferredBtOutputDevice = bluetoothRoute.outputDevice + + if (bluetoothRoute.outputDevice != null && isBluetoothMediaOutput(bluetoothRoute.outputDevice)) { + useBluetoothMediaOutputOnly = true + preferredBtInputDevice = null + commDeviceSet = false + scoStarted = false + try { + sysAm.mode = android.media.AudioManager.MODE_NORMAL + } catch (t: Throwable) { + Log.w(TAG, "MODE_NORMAL for Bluetooth media output failed: ${t.message}") + } + Log.d( + TAG, + "Bluetooth media output detected -> use built-in mic with media output, " + + "output=${describeDevice(bluetoothRoute.outputDevice)}" + ) + } else if (bluetoothRoute.communicationDevice != null || bluetoothRoute.inputDevice != null) { + sysAm.mode = android.media.AudioManager.MODE_IN_COMMUNICATION + + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S && bluetoothRoute.communicationDevice != null) { + try { + commDeviceSet = sysAm.setCommunicationDevice(bluetoothRoute.communicationDevice) + Log.d(TAG, "setCommunicationDevice(BT) = 
$commDeviceSet, dev=${describeDevice(bluetoothRoute.communicationDevice)}") + } catch (t: Throwable) { + commDeviceSet = false + Log.w(TAG, "setCommunicationDevice failed: ${t.message}") + } + } + + try { + sysAm.startBluetoothSco() + sysAm.isBluetoothScoOn = true + scoStarted = true + waitForBluetoothSco(sysAm) + Log.d(TAG, "Bluetooth SCO started") + } catch (t: Throwable) { + scoStarted = false + Log.w(TAG, "startBluetoothSco failed: ${t.message}") + } + } else { + commDeviceSet = false + scoStarted = false + Log.d(TAG, "No BT mic -> fallback to phone mic") + } + logBluetoothRoute("selected", bluetoothRoute) + } + val bufferSize = AudioRecord.getMinBufferSize( GeminiConfig.INPUT_AUDIO_SAMPLE_RATE, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT ) - audioRecord = AudioRecord( + val preferredInputDevice = + if (demoSpeakerMode || useBluetoothMediaOutputOnly) findBuiltInMicOrNull() else preferredBtInputDevice + audioRecord = buildAudioRecord(preferredInputDevice, bufferSize) + + val routed = audioRecord?.routedDevice + Log.d(TAG, "AudioRecord routedDevice: type=${routed?.type} name=${routed?.productName}") + + if (demoSpeakerMode) { + enableVoiceProcessing(audioRecord?.audioSessionId ?: 0) + } + + val newAudioTrack = buildAudioTrack( + useMediaOutput = demoSpeakerMode || useBluetoothMediaOutputOnly, + preferredOutputDevice = if (demoSpeakerMode) null else preferredBtOutputDevice, + ) + + audioRecord?.startRecording() + synchronized(playbackLock) { + playbackGeneration++ + audioTrack = newAudioTrack + try { + newAudioTrack.play() + } catch (t: Throwable) { + Log.w(TAG, "AudioTrack.play failed: ${t.message}") + } + } + isCapturing = true + + synchronized(accumulateLock) { + accumulatedData.reset() + } + silentInputLevelLogs = 0 + fellBackToBuiltInMic = false + + captureThread = Thread( + { + val buffer = ByteArray(bufferSize) + var tapCount = 0 + while (isCapturing) { + val read = audioRecord?.read(buffer, 0, buffer.size) ?: break + if (read > 0) { + 
logInputLevelIfNeeded(buffer, read) + + if (!micEnabled) { + // Mic muted: discard data and clear any partial buffer. + synchronized(accumulateLock) { + accumulatedData.reset() + } + continue + } + + tapCount++ + synchronized(accumulateLock) { + accumulatedData.write(buffer, 0, read) + if (accumulatedData.size() >= MIN_SEND_BYTES) { + val chunk = accumulatedData.toByteArray() + accumulatedData.reset() + if (tapCount <= 3) { + Log.d(TAG, "Sending chunk: ${chunk.size} bytes (~${chunk.size / 32}ms)") + } + onAudioCaptured?.invoke(chunk) + } + } + } + } + }, + "audio-capture" + ).also { it.start() } + + Log.d( + TAG, + "Audio capture started (16kHz mono PCM16, demoSpeakerMode=$demoSpeakerMode, " + + "useBluetoothMediaOutputOnly=$useBluetoothMediaOutputOnly)" + ) + } + + private fun logInputLevelIfNeeded(buffer: ByteArray, byteCount: Int) { + val now = System.currentTimeMillis() + if (now - lastInputLevelLogMs < 1000) return + lastInputLevelLogMs = now + + var sumSquares = 0.0 + var peak = 0 + var i = 0 + while (i + 1 < byteCount) { + val sample = ((buffer[i + 1].toInt() shl 8) or (buffer[i].toInt() and 0xff)).toShort().toInt() + val abs = kotlin.math.abs(sample) + if (abs > peak) peak = abs + sumSquares += sample.toDouble() * sample.toDouble() + i += 2 + } + val samples = byteCount / 2 + if (samples == 0) return + + val rms = kotlin.math.sqrt(sumSquares / samples) + Log.d( + TAG, + "Input level rms=${rms.toInt()} peak=$peak device=${describeDevice(audioRecord?.routedDevice)}" + ) + + if (preferredBtInputDevice != null && !fellBackToBuiltInMic && rms < 1.0 && peak == 0) { + silentInputLevelLogs++ + if (silentInputLevelLogs >= 3) { + switchToBuiltInMicAfterSilentBluetooth() + } + } else { + silentInputLevelLogs = 0 + } + } + + private fun switchToBuiltInMicAfterSilentBluetooth() { + val builtInMic = findBuiltInMicOrNull() + if (builtInMic == null) { + Log.w(TAG, "Bluetooth input is silent, but no built-in mic fallback was found") + return + } + + try { + 
leaveBluetoothCommunicationRoute() + val ok = rebuildAudioRecord(builtInMic) + fellBackToBuiltInMic = true + silentInputLevelLogs = 0 + Log.w( + TAG, + "Bluetooth input stayed silent; rebuilt capture for built-in mic " + + "preferredOk=$ok dev=${describeDevice(builtInMic)} routed=${describeDevice(audioRecord?.routedDevice)}" + ) + if (ok) { + switchPlaybackToMediaBluetoothAfterMicFallback() + } + } catch (t: Throwable) { + Log.w(TAG, "Built-in mic fallback failed: ${t.message}") + } + } + + private fun switchPlaybackToMediaBluetoothAfterMicFallback() { + leaveBluetoothCommunicationRoute() + + try { + Thread.sleep(150) + val mediaOutput = findBluetoothMediaOutputDeviceOrNull() + preferredBtOutputDevice = mediaOutput ?: preferredBtOutputDevice + + val newTrack = buildAudioTrack( + useMediaOutput = true, + preferredOutputDevice = preferredBtOutputDevice, + ) + + synchronized(playbackLock) { + playbackGeneration++ + val oldTrack = audioTrack + audioTrack = newTrack + try { + newTrack.play() + } catch (t: Throwable) { + Log.w(TAG, "Fallback AudioTrack.play failed: ${t.message}") + } + try { + oldTrack?.stop() + } catch (_: Throwable) { + } + try { + oldTrack?.release() + } catch (_: Throwable) { + } + } + Log.w(TAG, "Switched playback to media Bluetooth output dev=${describeDevice(preferredBtOutputDevice)}") + } catch (t: Throwable) { + Log.w(TAG, "Media Bluetooth playback fallback failed: ${t.message}") + } + } + + private fun leaveBluetoothCommunicationRoute() { + val sysAm = appContext.getSystemService(Context.AUDIO_SERVICE) as android.media.AudioManager + + try { + if (scoStarted) { + sysAm.stopBluetoothSco() + sysAm.isBluetoothScoOn = false + scoStarted = false + } + } catch (t: Throwable) { + Log.w(TAG, "stopBluetoothSco during fallback failed: ${t.message}") + } + + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S && commDeviceSet) { + try { + sysAm.clearCommunicationDevice() + commDeviceSet = false + } catch (t: Throwable) { + Log.w(TAG, 
"clearCommunicationDevice during fallback failed: ${t.message}") + } + } + + try { + sysAm.mode = android.media.AudioManager.MODE_NORMAL + } catch (t: Throwable) { + Log.w(TAG, "MODE_NORMAL during fallback failed: ${t.message}") + } + } + + private fun buildAudioRecord( + preferredInputDevice: AudioDeviceInfo?, + bufferSize: Int, + ): AudioRecord { + val record = AudioRecord( MediaRecorder.AudioSource.VOICE_COMMUNICATION, GeminiConfig.INPUT_AUDIO_SAMPLE_RATE, AudioFormat.CHANNEL_IN_MONO, @@ -43,10 +369,61 @@ class AudioManager { bufferSize ) - audioTrack = AudioTrack.Builder() + preferredInputDevice?.let { dev -> + try { + val ok = record.setPreferredDevice(dev) + Log.d(TAG, "AudioRecord.setPreferredDevice ok=$ok dev=${describeDevice(dev)}") + } catch (t: Throwable) { + Log.w(TAG, "setPreferredDevice failed: ${t.message}") + } + } + + return record + } + + private fun rebuildAudioRecord(preferredInputDevice: AudioDeviceInfo): Boolean { + val bufferSize = AudioRecord.getMinBufferSize( + GeminiConfig.INPUT_AUDIO_SAMPLE_RATE, + AudioFormat.CHANNEL_IN_MONO, + AudioFormat.ENCODING_PCM_16BIT + ) + + val oldRecord = audioRecord + audioRecord = null + try { + oldRecord?.stop() + } catch (_: Throwable) { + } + try { + oldRecord?.release() + } catch (_: Throwable) { + } + + val newRecord = buildAudioRecord(preferredInputDevice, bufferSize) + audioRecord = newRecord + newRecord.startRecording() + + synchronized(accumulateLock) { + accumulatedData.reset() + } + + return newRecord.preferredDevice?.id == preferredInputDevice.id + } + + private fun buildAudioTrack( + useMediaOutput: Boolean, + preferredOutputDevice: AudioDeviceInfo?, + ): AudioTrack { + val track = AudioTrack.Builder() .setAudioAttributes( AudioAttributes.Builder() - .setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION) + .setUsage( + if (useMediaOutput) { + AudioAttributes.USAGE_MEDIA + } else { + AudioAttributes.USAGE_VOICE_COMMUNICATION + } + ) .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) .build() ) @@ 
-67,48 +444,266 @@ class AudioManager { ) .build() - audioRecord?.startRecording() - audioTrack?.play() - isCapturing = true + preferredOutputDevice?.let { dev -> + try { + val ok = track.setPreferredDevice(dev) + Log.d(TAG, "AudioTrack.setPreferredDevice ok=$ok dev=${describeDevice(dev)} media=$useMediaOutput") + } catch (t: Throwable) { + Log.w(TAG, "AudioTrack.setPreferredDevice failed: ${t.message}") + } + } - synchronized(accumulateLock) { - accumulatedData.reset() + return track + } + + private fun waitForBluetoothSco(sysAm: android.media.AudioManager) { + repeat(12) { + if (sysAm.isBluetoothScoOn) return + Thread.sleep(100) } + } - captureThread = Thread({ - val buffer = ByteArray(bufferSize) - var tapCount = 0 - while (isCapturing) { - val read = audioRecord?.read(buffer, 0, buffer.size) ?: break - if (read > 0) { - tapCount++ - synchronized(accumulateLock) { - accumulatedData.write(buffer, 0, read) - if (accumulatedData.size() >= MIN_SEND_BYTES) { - val chunk = accumulatedData.toByteArray() - accumulatedData.reset() - if (tapCount <= 3) { - Log.d(TAG, "Sending chunk: ${chunk.size} bytes (~${chunk.size / 32}ms)") - } - onAudioCaptured?.invoke(chunk) - } - } + private fun findBluetoothAudioRoute(sysAm: android.media.AudioManager): BluetoothAudioRoute { + val inputs = sysAm.getDevices(android.media.AudioManager.GET_DEVICES_INPUTS) + val outputs = sysAm.getDevices(android.media.AudioManager.GET_DEVICES_OUTPUTS) + + val communicationDevice = + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) { + sysAm.availableCommunicationDevices.firstOrNull { isBluetoothDevice(it) } + } else { + null + } + val inputDevice = + inputs.firstOrNull { it.type == AudioDeviceInfo.TYPE_BLUETOOTH_SCO } + ?: inputs.firstOrNull { isBluetoothDevice(it) } + ?: communicationDevice?.takeIf { it.isSource } + val outputDevice = + outputs.firstOrNull { it.type == AudioDeviceInfo.TYPE_BLUETOOTH_A2DP } + ?: outputs.firstOrNull { + Build.VERSION.SDK_INT >= Build.VERSION_CODES.S && + (it.type 
== AudioDeviceInfo.TYPE_BLE_HEADSET || + it.type == AudioDeviceInfo.TYPE_BLE_SPEAKER) } + ?: outputs.firstOrNull { it.type == AudioDeviceInfo.TYPE_BLUETOOTH_SCO } + ?: communicationDevice?.takeIf { it.isSink } + + logAudioDevices(sysAm, inputs, outputs) + + return BluetoothAudioRoute( + communicationDevice = communicationDevice, + inputDevice = inputDevice, + outputDevice = outputDevice, + ) + } + + private fun isBluetoothDevice(device: AudioDeviceInfo): Boolean { + return when (device.type) { + AudioDeviceInfo.TYPE_BLUETOOTH_SCO, + AudioDeviceInfo.TYPE_BLUETOOTH_A2DP -> true + else -> + Build.VERSION.SDK_INT >= Build.VERSION_CODES.S && + (device.type == AudioDeviceInfo.TYPE_BLE_HEADSET || + device.type == AudioDeviceInfo.TYPE_BLE_SPEAKER) + } + } + + private fun isBluetoothMediaOutput(device: AudioDeviceInfo): Boolean { + return when (device.type) { + AudioDeviceInfo.TYPE_BLUETOOTH_A2DP -> true + else -> + Build.VERSION.SDK_INT >= Build.VERSION_CODES.S && + (device.type == AudioDeviceInfo.TYPE_BLE_HEADSET || + device.type == AudioDeviceInfo.TYPE_BLE_SPEAKER) + } + } + + private fun logAudioDevices( + sysAm: android.media.AudioManager, + inputs: Array, + outputs: Array, + ) { + Log.d(TAG, "Audio inputs: ${inputs.joinToString { describeDevice(it) }}") + Log.d(TAG, "Audio outputs: ${outputs.joinToString { describeDevice(it) }}") + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) { + Log.d( + TAG, + "Communication devices: ${sysAm.availableCommunicationDevices.joinToString { describeDevice(it) }}" + ) + } + } + + private fun logBluetoothRoute(label: String, route: BluetoothAudioRoute) { + Log.d( + TAG, + "Bluetooth route $label: communication=${describeDevice(route.communicationDevice)}, " + + "input=${describeDevice(route.inputDevice)}, output=${describeDevice(route.outputDevice)}" + ) + } + + private fun describeDevice(device: AudioDeviceInfo?): String { + return device?.let { + "type=${it.type}, name=${it.productName}, source=${it.isSource}, sink=${it.isSink}" + 
} ?: "none" + } + + private fun findBuiltInMicOrNull(): AudioDeviceInfo? { + val sysAm = appContext.getSystemService(Context.AUDIO_SERVICE) as android.media.AudioManager + val inputs = sysAm.getDevices(android.media.AudioManager.GET_DEVICES_INPUTS) + return inputs.firstOrNull { it.type == AudioDeviceInfo.TYPE_BUILTIN_MIC } + } + + private fun findBuiltInSpeakerOrNull(): AudioDeviceInfo? { + val sysAm = appContext.getSystemService(Context.AUDIO_SERVICE) as android.media.AudioManager + val outputs = sysAm.getDevices(android.media.AudioManager.GET_DEVICES_OUTPUTS) + return outputs.firstOrNull { it.type == AudioDeviceInfo.TYPE_BUILTIN_SPEAKER } + } + + private fun findBluetoothMediaOutputDeviceOrNull(): AudioDeviceInfo? { + val sysAm = appContext.getSystemService(Context.AUDIO_SERVICE) as android.media.AudioManager + val outputs = sysAm.getDevices(android.media.AudioManager.GET_DEVICES_OUTPUTS) + return outputs.firstOrNull { it.type == AudioDeviceInfo.TYPE_BLUETOOTH_A2DP } + ?: outputs.firstOrNull { + Build.VERSION.SDK_INT >= Build.VERSION_CODES.S && + (it.type == AudioDeviceInfo.TYPE_BLE_HEADSET || + it.type == AudioDeviceInfo.TYPE_BLE_SPEAKER) } - }, "audio-capture").also { it.start() } + ?: outputs.firstOrNull { it.type == AudioDeviceInfo.TYPE_BLUETOOTH_SCO } + } + + private fun enableVoiceProcessing(audioSessionId: Int) { + if (audioSessionId == 0) return + + if (AcousticEchoCanceler.isAvailable()) { + try { + echoCanceler = AcousticEchoCanceler.create(audioSessionId)?.apply { enabled = true } + Log.d(TAG, "AcousticEchoCanceler enabled=${echoCanceler?.enabled}") + } catch (t: Throwable) { + Log.w(TAG, "AcousticEchoCanceler failed: ${t.message}") + } + } - Log.d(TAG, "Audio capture started (16kHz mono PCM16)") + if (NoiseSuppressor.isAvailable()) { + try { + noiseSuppressor = NoiseSuppressor.create(audioSessionId)?.apply { enabled = true } + Log.d(TAG, "NoiseSuppressor enabled=${noiseSuppressor?.enabled}") + } catch (t: Throwable) { + Log.w(TAG, "NoiseSuppressor 
failed: ${t.message}") + } + } + + if (AutomaticGainControl.isAvailable()) { + try { + automaticGainControl = AutomaticGainControl.create(audioSessionId)?.apply { enabled = true } + Log.d(TAG, "AutomaticGainControl enabled=${automaticGainControl?.enabled}") + } catch (t: Throwable) { + Log.w(TAG, "AutomaticGainControl failed: ${t.message}") + } + } + } + + private fun releaseVoiceProcessing() { + echoCanceler?.release() + echoCanceler = null + noiseSuppressor?.release() + noiseSuppressor = null + automaticGainControl?.release() + automaticGainControl = null } fun playAudio(data: ByteArray) { if (!isCapturing || data.isEmpty()) return - audioTrack?.write(data, 0, data.size) + val generation = playbackGeneration + val chunk = data.copyOf() + try { + playbackExecutor.execute { + if (!isCapturing || generation != playbackGeneration) return@execute + synchronized(playbackLock) { + if (!isCapturing || generation != playbackGeneration) return@synchronized + val track = audioTrack ?: return@synchronized + try { + ensurePreferredPlaybackDevice(track) + val written = track.write(chunk, 0, chunk.size) + if (written < 0) { + Log.w(TAG, "AudioTrack.write failed: $written") + } else { + logPlaybackLevelIfNeeded(track, chunk, written) + } + } catch (t: Throwable) { + Log.w(TAG, "AudioTrack.write threw: ${t.message}") + } + } + } + } catch (t: RejectedExecutionException) { + Log.w(TAG, "Playback executor rejected audio: ${t.message}") + } + } + + private fun ensurePreferredPlaybackDevice(track: AudioTrack) { + val routed = track.routedDevice + if (routed != null && isBluetoothMediaOutput(routed)) return + + val mediaOutput = findBluetoothMediaOutputDeviceOrNull() ?: preferredBtOutputDevice ?: return + preferredBtOutputDevice = mediaOutput + + try { + val ok = track.setPreferredDevice(mediaOutput) + Log.w( + TAG, + "Reassert playback Bluetooth output ok=$ok dev=${describeDevice(mediaOutput)} " + + "previous=${describeDevice(routed)}" + ) + } catch (t: Throwable) { + Log.w(TAG, 
"Reassert playback Bluetooth output threw: ${t.message}") + } + } + + private fun logPlaybackLevelIfNeeded(track: AudioTrack, buffer: ByteArray, byteCount: Int) { + val now = System.currentTimeMillis() + if (now - lastPlaybackLevelLogMs < 1000) return + lastPlaybackLevelLogMs = now + + val level = computePcmLevel(buffer, byteCount) + Log.d( + TAG, + "Playback write bytes=$byteCount rms=${level.first} peak=${level.second} device=${describeDevice(track.routedDevice)}" + ) + } + + private fun computePcmLevel(buffer: ByteArray, byteCount: Int): Pair { + var sumSquares = 0.0 + var peak = 0 + var i = 0 + while (i + 1 < byteCount) { + val sample = ((buffer[i + 1].toInt() shl 8) or (buffer[i].toInt() and 0xff)).toShort().toInt() + val abs = kotlin.math.abs(sample) + if (abs > peak) peak = abs + sumSquares += sample.toDouble() * sample.toDouble() + i += 2 + } + val samples = byteCount / 2 + if (samples == 0) return 0 to 0 + return kotlin.math.sqrt(sumSquares / samples).toInt() to peak } fun stopPlayback() { - audioTrack?.pause() - audioTrack?.flush() - audioTrack?.play() + val generation = playbackGeneration + try { + playbackExecutor.execute { + synchronized(playbackLock) { + if (generation != playbackGeneration) return@synchronized + val track = audioTrack ?: return@synchronized + try { + track.pause() + track.flush() + track.play() + } catch (t: Throwable) { + Log.w(TAG, "stopPlayback failed: ${t.message}") + } + } + } + } catch (t: RejectedExecutionException) { + Log.w(TAG, "Playback executor rejected stopPlayback: ${t.message}") + } } fun stopCapture() { @@ -120,20 +715,60 @@ class AudioManager { // Flush remaining accumulated audio synchronized(accumulateLock) { - if (accumulatedData.size() > 0) { + if (micEnabled && accumulatedData.size() > 0) { val chunk = accumulatedData.toByteArray() accumulatedData.reset() onAudioCaptured?.invoke(chunk) + } else { + accumulatedData.reset() } } audioRecord?.stop() + releaseVoiceProcessing() audioRecord?.release() audioRecord = 
null - audioTrack?.stop() - audioTrack?.release() - audioTrack = null + synchronized(playbackLock) { + playbackGeneration++ + val track = audioTrack + audioTrack = null + try { + track?.stop() + } catch (t: Throwable) { + Log.w(TAG, "AudioTrack.stop failed: ${t.message}") + } + try { + track?.release() + } catch (t: Throwable) { + Log.w(TAG, "AudioTrack.release failed: ${t.message}") + } + } + + val sysAm = appContext.getSystemService(Context.AUDIO_SERVICE) as android.media.AudioManager + + if (scoStarted) { + try { + sysAm.stopBluetoothSco() + sysAm.isBluetoothScoOn = false + } catch (_: Throwable) { + } + scoStarted = false + } + + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S && commDeviceSet) { + try { + sysAm.clearCommunicationDevice() + } catch (_: Throwable) { + } + commDeviceSet = false + } + + preferredBtDevice = null + preferredBtInputDevice = null + preferredBtOutputDevice = null + + sysAm.mode = android.media.AudioManager.MODE_NORMAL Log.d(TAG, "Audio capture stopped") } diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiConfig.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiConfig.kt index 10ba908e..49c8b632 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiConfig.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiConfig.kt @@ -16,7 +16,7 @@ object GeminiConfig { const val VIDEO_JPEG_QUALITY = 50 val systemInstruction: String - get() = SettingsManager.geminiSystemPrompt + get() = SettingsManager.geminiSystemPrompt.trimEnd() + TOOL_RESULT_ADDENDUM val apiKey: String get() = SettingsManager.geminiAPIKey @@ -45,4 +45,17 @@ object GeminiConfig { get() = openClawGatewayToken != "YOUR_OPENCLAW_GATEWAY_TOKEN" && openClawGatewayToken.isNotEmpty() && openClawHost != 
"http://YOUR_MAC_HOSTNAME.local" + + private const val TOOL_RESULT_ADDENDUM = """ + +-------------------------------- +TOOL RESULT HANDLING +-------------------------------- + +When execute returns a result, immediately answer the user using that result. +Do not end the turn with only a brief acknowledgment after execute has returned. +If you did not manage to say the acknowledgment before calling execute, do not say it after the result arrives; use the result instead. +Keep the final answer concise and in the user's conversation language. +If the user's utterance contains Japanese, use Japanese for both the pre-tool acknowledgment and the final answer. +""" } diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiLiveService.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiLiveService.kt index d046d306..81a7e7f0 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiLiveService.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiLiveService.kt @@ -42,6 +42,10 @@ class GeminiLiveService { private val _isModelSpeaking = MutableStateFlow(false) val isModelSpeaking: StateFlow = _isModelSpeaking.asStateFlow() + // Debug: last disconnect/failure detail + private val _lastDisconnectInfo = MutableStateFlow(null) + val lastDisconnectInfo: StateFlow = _lastDisconnectInfo.asStateFlow() + var onAudioReceived: ((ByteArray) -> Unit)? = null var onTurnComplete: (() -> Unit)? = null var onInterrupted: (() -> Unit)? = null @@ -60,15 +64,19 @@ class GeminiLiveService { private var connectCallback: ((Boolean) -> Unit)? = null private var timeoutTimer: Timer? = null + // NOTE: user said pingInterval already increased; keep your current value here. + // If you want, change 10 -> 30/60. 
private val client = OkHttpClient.Builder() .readTimeout(0, TimeUnit.MILLISECONDS) - .pingInterval(10, TimeUnit.SECONDS) + .pingInterval(30, TimeUnit.SECONDS) + .retryOnConnectionFailure(true) .build() fun connect(callback: (Boolean) -> Unit) { val url = GeminiConfig.websocketURL() if (url == null) { _connectionState.value = GeminiConnectionState.Error("No API key configured") + _lastDisconnectInfo.value = "No API key configured" callback(false) return } @@ -93,24 +101,40 @@ class GeminiLiveService { } override fun onFailure(webSocket: WebSocket, t: Throwable, response: Response?) { - val msg = t.message ?: "Unknown error" - Log.e(TAG, "WebSocket failure: $msg") - _connectionState.value = GeminiConnectionState.Error(msg) + val detail = buildString { + append("WS failure: ") + append(t::class.java.name) + append(": ") + append(t.message ?: "no-message") + if (response != null) { + append(" | HTTP ") + append(response.code) + append(" ") + append(response.message) + } + } + + Log.e(TAG, detail, t) + _lastDisconnectInfo.value = detail + _connectionState.value = GeminiConnectionState.Error(detail) _isModelSpeaking.value = false resolveConnect(false) - onDisconnected?.invoke(msg) + onDisconnected?.invoke(detail) } override fun onClosing(webSocket: WebSocket, code: Int, reason: String) { - Log.d(TAG, "WebSocket closing: $code $reason") + val detail = "WS closing: code=$code reason=$reason" + Log.w(TAG, detail) + _lastDisconnectInfo.value = detail _connectionState.value = GeminiConnectionState.Disconnected _isModelSpeaking.value = false resolveConnect(false) - onDisconnected?.invoke("Connection closed (code $code: $reason)") + onDisconnected?.invoke(detail) } override fun onClosed(webSocket: WebSocket, code: Int, reason: String) { Log.d(TAG, "WebSocket closed: $code $reason") + _lastDisconnectInfo.value = "WS closed: code=$code reason=$reason" _connectionState.value = GeminiConnectionState.Disconnected _isModelSpeaking.value = false } @@ -122,9 +146,13 @@ class 
GeminiLiveService { override fun run() { if (_connectionState.value == GeminiConnectionState.Connecting || _connectionState.value == GeminiConnectionState.SettingUp) { - Log.e(TAG, "Connection timed out") - _connectionState.value = GeminiConnectionState.Error("Connection timed out") + val detail = "WS connect/setup timed out (15s)" + Log.e(TAG, detail) + _lastDisconnectInfo.value = detail + _connectionState.value = GeminiConnectionState.Error(detail) + webSocket?.cancel() resolveConnect(false) + onDisconnected?.invoke(detail) } } }, 15000) @@ -136,8 +164,8 @@ class GeminiLiveService { timeoutTimer = null webSocket?.close(1000, null) webSocket = null - onToolCall = null - onToolCallCancellation = null +// onToolCall = null +// onToolCallCancellation = null _connectionState.value = GeminiConnectionState.Disconnected _isModelSpeaking.value = false resolveConnect(false) @@ -155,6 +183,7 @@ class GeminiLiveService { }) }) } + Log.d("GeminiWS", "SEND_AUDIO_CHUNK") webSocket?.send(json.toString()) } } @@ -173,13 +202,16 @@ class GeminiLiveService { }) }) } + Log.d("GeminiWS", "SEND_VIDEO_FRAME") webSocket?.send(json.toString()) } } fun sendToolResponse(response: JSONObject) { sendExecutor.execute { - webSocket?.send(response.toString()) + val payload = response.toString() + val sent = webSocket?.send(payload) ?: false + Log.d("GeminiWS", "SEND_TOOL sent=$sent bytes=${payload.length}: ${payload.take(300)}") } } @@ -194,6 +226,7 @@ class GeminiLiveService { put("text", text) })) })) + put("turnComplete", true) }) } webSocket?.send(json.toString()) @@ -249,6 +282,7 @@ class GeminiLiveService { }) } // Send directly (not via sendExecutor) to ensure it's the first message + Log.d("GeminiWS", "SEND_SETUP") webSocket?.send(setup.toString()) } @@ -266,10 +300,21 @@ class GeminiLiveService { // GoAway if (json.has("goAway")) { val goAway = json.getJSONObject("goAway") - val seconds = goAway.optJSONObject("timeLeft")?.optInt("seconds", 0) ?: 0 + val detail = 
goAway.optString("detail", "server requested disconnect") + Log.w(TAG, "Gemini goAway: $detail") + + val ws = webSocket + webSocket = null + + try { + ws?.close(1000, detail) + } catch (e: Exception) { + Log.w(TAG, "Error closing websocket on goAway", e) + } + _connectionState.value = GeminiConnectionState.Disconnected _isModelSpeaking.value = false - onDisconnected?.invoke("Server closing (time left: ${seconds}s)") + onDisconnected?.invoke(detail) return } @@ -294,6 +339,7 @@ class GeminiLiveService { val serverContent = json.getJSONObject("serverContent") if (serverContent.optBoolean("interrupted", false)) { + Log.d(TAG, "serverContent interrupted") _isModelSpeaking.value = false onInterrupted?.invoke() return @@ -312,6 +358,7 @@ class GeminiLiveService { val base64Data = inlineData.optString("data", "") if (base64Data.isNotEmpty()) { val audioData = Base64.decode(base64Data, Base64.DEFAULT) + Log.d(TAG, "Audio received bytes=${audioData.size}") if (!_isModelSpeaking.value) { _isModelSpeaking.value = true if (lastUserSpeechEnd > 0 && !responseLatencyLogged) { @@ -331,6 +378,7 @@ class GeminiLiveService { } if (serverContent.optBoolean("turnComplete", false)) { + Log.d(TAG, "serverContent turnComplete") _isModelSpeaking.value = false responseLatencyLogged = false onTurnComplete?.invoke() diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiProgressSpeechService.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiProgressSpeechService.kt new file mode 100644 index 00000000..cea3372b --- /dev/null +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiProgressSpeechService.kt @@ -0,0 +1,168 @@ +package com.meta.wearable.dat.externalsampleapps.cameraaccess.gemini + +import android.util.Base64 +import android.util.Log +import java.util.concurrent.Executors +import 
java.util.concurrent.TimeUnit +import okhttp3.OkHttpClient +import okhttp3.Request +import okhttp3.Response +import okhttp3.WebSocket +import okhttp3.WebSocketListener +import okio.ByteString +import org.json.JSONArray +import org.json.JSONObject + +class GeminiProgressSpeechService { + companion object { + private const val TAG = "GeminiProgressSpeech" + } + + var onAudioReceived: ((ByteArray) -> Unit)? = null + + private val client = OkHttpClient.Builder() + .readTimeout(0, TimeUnit.MILLISECONDS) + .pingInterval(30, TimeUnit.SECONDS) + .retryOnConnectionFailure(true) + .build() + private val sendExecutor = Executors.newSingleThreadExecutor() + private var webSocket: WebSocket? = null + private var ready = false + private val pendingPhrases = ArrayDeque() + + fun connect() { + if (ready || webSocket != null) return + val url = GeminiConfig.websocketURL() ?: return + val request = Request.Builder().url(url).build() + webSocket = client.newWebSocket(request, object : WebSocketListener() { + override fun onOpen(webSocket: WebSocket, response: Response) { + Log.d(TAG, "WebSocket opened") + sendSetup(webSocket) + } + + override fun onMessage(webSocket: WebSocket, text: String) { + handleMessage(text) + } + + override fun onMessage(webSocket: WebSocket, bytes: ByteString) { + handleMessage(bytes.utf8()) + } + + override fun onFailure(webSocket: WebSocket, t: Throwable, response: Response?) 
{ + Log.w(TAG, "WebSocket failure: ${t.message}") + ready = false + this@GeminiProgressSpeechService.webSocket = null + } + + override fun onClosed(webSocket: WebSocket, code: Int, reason: String) { + Log.d(TAG, "WebSocket closed: $code $reason") + ready = false + this@GeminiProgressSpeechService.webSocket = null + } + }) + } + + fun disconnect() { + ready = false + pendingPhrases.clear() + webSocket?.close(1000, null) + webSocket = null + } + + fun speakProgress(speechHint: String, languageName: String) { + val trimmedHint = speechHint.trim() + if (trimmedHint.isEmpty()) return + val request = "Language: $languageName\nProgress hint: $trimmedHint" + if (!ready) { + pendingPhrases.addLast(request) + connect() + return + } + sendSpeakRequest(request) + } + + private fun sendSetup(ws: WebSocket) { + val setup = JSONObject().apply { + put("setup", JSONObject().apply { + put("model", GeminiConfig.MODEL) + put("generationConfig", JSONObject().apply { + put("responseModalities", JSONArray().put("AUDIO")) + put("thinkingConfig", JSONObject().apply { + put("thinkingBudget", 0) + }) + }) + put("systemInstruction", JSONObject().apply { + put("parts", JSONArray().put(JSONObject().apply { + put( + "text", + "You are a progress voice for smart glasses. " + + "Each user message contains a language and a semantic progress hint, not a request. " + + "Say one short, natural progress update in that language. " + + "Preserve useful target names like people, apps, or domains. " + + "Keep names, apps, and domains exactly as written; do not translate, transliterate, or invent kanji for them. " + + "Do not add acknowledgments, explanations, tags, or extra words." 
+ ) + })) + }) + put("outputAudioTranscription", JSONObject()) + }) + } + ws.send(setup.toString()) + } + + private fun sendSpeakRequest(requestText: String) { + sendExecutor.execute { + val json = JSONObject().apply { + put("clientContent", JSONObject().apply { + put("turns", JSONArray().put(JSONObject().apply { + put("role", "user") + put("parts", JSONArray().put(JSONObject().apply { + put("text", requestText) + })) + })) + put("turnComplete", true) + }) + } + Log.d(TAG, "SEND_PROGRESS_SPEECH: ${requestText.replace('\n', ' ')}") + webSocket?.send(json.toString()) + } + } + + private fun handleMessage(text: String) { + try { + val json = JSONObject(text) + if (json.has("setupComplete")) { + ready = true + Log.d(TAG, "setupComplete") + while (pendingPhrases.isNotEmpty()) { + sendSpeakRequest(pendingPhrases.removeFirst()) + } + return + } + + val serverContent = json.optJSONObject("serverContent") ?: return + val modelTurn = serverContent.optJSONObject("modelTurn") + val parts = modelTurn?.optJSONArray("parts") + if (parts != null) { + for (i in 0 until parts.length()) { + val part = parts.optJSONObject(i) ?: continue + val inlineData = part.optJSONObject("inlineData") ?: continue + val mimeType = inlineData.optString("mimeType", "") + if (!mimeType.startsWith("audio/pcm")) continue + val base64Data = inlineData.optString("data", "") + if (base64Data.isNotEmpty()) { + onAudioReceived?.invoke(Base64.decode(base64Data, Base64.DEFAULT)) + } + } + } + + val transcription = serverContent.optJSONObject("outputTranscription") + val transcriptText = transcription?.optString("text", "").orEmpty() + if (transcriptText.isNotEmpty()) { + Log.d(TAG, "Progress voice: $transcriptText") + } + } catch (e: Exception) { + Log.w(TAG, "Parse error: ${e.message}") + } + } +} diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiSessionViewModel.kt 
b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiSessionViewModel.kt index 31567442..7acd0def 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiSessionViewModel.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiSessionViewModel.kt @@ -1,16 +1,30 @@ +// app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/GeminiSessionViewModel.kt package com.meta.wearable.dat.externalsampleapps.cameraaccess.gemini +import android.app.Application import android.graphics.Bitmap import android.util.Log -import androidx.lifecycle.ViewModel +import androidx.lifecycle.AndroidViewModel import androidx.lifecycle.viewModelScope +import com.meta.wearable.dat.externalsampleapps.cameraaccess.chat.ChatMessage +import com.meta.wearable.dat.externalsampleapps.cameraaccess.chat.ChatMessageRole +import com.meta.wearable.dat.externalsampleapps.cameraaccess.chat.ChatMessageStatus +import com.meta.wearable.dat.externalsampleapps.cameraaccess.net.NetworkType +import com.meta.wearable.dat.externalsampleapps.cameraaccess.net.NetworkTypeMonitor import com.meta.wearable.dat.externalsampleapps.cameraaccess.openclaw.OpenClawBridge -import com.meta.wearable.dat.externalsampleapps.cameraaccess.openclaw.OpenClawEventClient -import com.meta.wearable.dat.externalsampleapps.cameraaccess.settings.SettingsManager import com.meta.wearable.dat.externalsampleapps.cameraaccess.openclaw.OpenClawConnectionState +import com.meta.wearable.dat.externalsampleapps.cameraaccess.openclaw.OpenClawEventClient +import com.meta.wearable.dat.externalsampleapps.cameraaccess.openclaw.OpenClawProgress +import com.meta.wearable.dat.externalsampleapps.cameraaccess.openclaw.OpenClawProgressKind import com.meta.wearable.dat.externalsampleapps.cameraaccess.openclaw.ToolCallRouter import 
com.meta.wearable.dat.externalsampleapps.cameraaccess.openclaw.ToolCallStatus +import com.meta.wearable.dat.externalsampleapps.cameraaccess.openclaw.ToolResult +import com.meta.wearable.dat.externalsampleapps.cameraaccess.chat.ChatHistoryStore +import com.meta.wearable.dat.externalsampleapps.cameraaccess.gallery.CapturedPhoto +import com.meta.wearable.dat.externalsampleapps.cameraaccess.gallery.PhotoCaptureStore +import com.meta.wearable.dat.externalsampleapps.cameraaccess.settings.SettingsManager import com.meta.wearable.dat.externalsampleapps.cameraaccess.stream.StreamingMode +import com.meta.wearable.dat.externalsampleapps.cameraaccess.stream.StreamingService import kotlinx.coroutines.Job import kotlinx.coroutines.delay import kotlinx.coroutines.flow.MutableStateFlow @@ -23,95 +37,298 @@ data class GeminiUiState( val isGeminiActive: Boolean = false, val connectionState: GeminiConnectionState = GeminiConnectionState.Disconnected, val isModelSpeaking: Boolean = false, + val isMicEnabled: Boolean = true, val errorMessage: String? 
= null, val userTranscript: String = "", val aiTranscript: String = "", + val messages: List = emptyList(), val toolCallStatus: ToolCallStatus = ToolCallStatus.Idle, val openClawConnectionState: OpenClawConnectionState = OpenClawConnectionState.NotConfigured, + val networkType: NetworkType = NetworkType.NONE, ) -class GeminiSessionViewModel : ViewModel() { - companion object { - private const val TAG = "GeminiSessionVM" - } +class GeminiSessionViewModel(app: Application) : AndroidViewModel(app) { - private val _uiState = MutableStateFlow(GeminiUiState()) + private val _uiState = MutableStateFlow(GeminiUiState( + messages = ChatHistoryStore.load(app) + )) val uiState: StateFlow = _uiState.asStateFlow() + private val _captureEvent = MutableStateFlow(null) + val captureEvent: StateFlow = _captureEvent.asStateFlow() + private val geminiService = GeminiLiveService() + private val progressSpeechService = GeminiProgressSpeechService() private val openClawBridge = OpenClawBridge() - private var toolCallRouter: ToolCallRouter? = null - private val audioManager = AudioManager() private val eventClient = OpenClawEventClient() + private var toolCallRouter: ToolCallRouter? = null + private val audioManager = AudioManager(getApplication().applicationContext) private var lastVideoFrameTime: Long = 0 + + @Volatile private var latestFrameForToolCall: Bitmap? = null + @Volatile private var lastUserOriginalInstruction: String? = null + private var stateObservationJob: Job? = null + private var userStopped = false + private var reconnectJob: Job? = null + private var reconnectAttempts = 0 + private val maxReconnectAttempts = 6 + var streamingMode: StreamingMode = StreamingMode.GLASSES + private val netMonitor = NetworkTypeMonitor(app) + private var netMonitorJob: Job? = null + + private val videoIntervalWifiMs = 1000L + private val videoIntervalCellularMs = 4000L + private val videoIntervalOtherMs = 2000L + + // Chat message tracking + private var activeUserBubbleId: String? 
= null + private var activeAIBubbleId: String? = null + private var lastUserText: String = "" + private var lastAIText: String = "" + private var lastSpokenProgressKind: OpenClawProgressKind? = null + private var lastSpokenProgressAtMs: Long = 0 + + // execute 시작 시 mic 상태를 저장해뒀다가 끝나면 복원 + private var micStateBeforeExecution: Boolean? = null + private var micAutoMutedForExecution = false + + private fun isToolExecuting(status: ToolCallStatus): Boolean { + return status is ToolCallStatus.Executing + } + + private fun syncMicWithToolExecution(status: ToolCallStatus) { + // With NON_BLOCKING execute, keep mic on so user can keep talking + return + + val executing = isToolExecuting(status) + + if (executing) { + if (!micAutoMutedForExecution) { + micStateBeforeExecution = _uiState.value.isMicEnabled + + if (_uiState.value.isMicEnabled) { + _uiState.value = _uiState.value.copy(isMicEnabled = false) + audioManager.setMicEnabled(false) + } + + micAutoMutedForExecution = true + } + return + } + + if (micAutoMutedForExecution) { + val restoreMic = micStateBeforeExecution ?: true + _uiState.value = _uiState.value.copy(isMicEnabled = restoreMic) + audioManager.setMicEnabled(restoreMic) + + micStateBeforeExecution = null + micAutoMutedForExecution = false + } + } + + fun toggleMic() { + if (!_uiState.value.isGeminiActive) return + + val newEnabled = !_uiState.value.isMicEnabled + _uiState.value = _uiState.value.copy(isMicEnabled = newEnabled) + audioManager.setMicEnabled(newEnabled) + } + + fun setMicEnabled(enabled: Boolean) { + if (!_uiState.value.isGeminiActive) return + + _uiState.value = _uiState.value.copy(isMicEnabled = enabled) + audioManager.setMicEnabled(enabled) + } + fun startSession() { if (_uiState.value.isGeminiActive) return if (!GeminiConfig.isConfigured) { _uiState.value = _uiState.value.copy( - errorMessage = "Gemini API key not configured. Open Settings and add your key from https://aistudio.google.com/apikey" + errorMessage = "Gemini API key not configured. 
Open Settings and add your key." ) return } - _uiState.value = _uiState.value.copy(isGeminiActive = true) + userStopped = false + reconnectAttempts = 0 + reconnectJob?.cancel() + reconnectJob = null + micStateBeforeExecution = null + micAutoMutedForExecution = false + lastSpokenProgressKind = null + lastSpokenProgressAtMs = 0 + + // Insert session divider if there are previous messages + val currentMessages = _uiState.value.messages.toMutableList() + if (currentMessages.isNotEmpty()) { + currentMessages.add(ChatMessage(role = ChatMessageRole.SessionDivider, text = "")) + } + + // Start foreground service to keep alive when screen is locked + StreamingService.start(getApplication()) + + // Start with mic enabled by default + _uiState.value = _uiState.value.copy(isGeminiActive = true, isMicEnabled = true, messages = currentMessages) + audioManager.setMicEnabled(true) + RemoteLogger.log("session:start") + + netMonitor.start() + netMonitorJob?.cancel() + netMonitorJob = viewModelScope.launch { + netMonitor.networkType.collect { t -> + _uiState.value = _uiState.value.copy(networkType = t) + } + } - // Wire audio callbacks audioManager.onAudioCaptured = lambda@{ data -> - // Phone mode: mute mic while model speaks to prevent echo + // NON_BLOCKING: keep sending audio during tool execution + + // streamingMode == PHONE 일때 모델이 말하는동안에는 입력을 막음(기존 로직) if (streamingMode == StreamingMode.PHONE && geminiService.isModelSpeaking.value) return@lambda + geminiService.sendAudio(data) } geminiService.onAudioReceived = { data -> audioManager.playAudio(data) } + progressSpeechService.onAudioReceived = { data -> + audioManager.playAudio(data) + } geminiService.onInterrupted = { audioManager.stopPlayback() } geminiService.onTurnComplete = { + // Log finalized transcripts before clearing + if (lastUserText.isNotEmpty()) { + RemoteLogger.log("voice:user", mapOf("text" to lastUserText)) + } + if (lastAIText.isNotEmpty()) { + RemoteLogger.log("voice:ai", mapOf("text" to lastAIText)) + } + 
finalizeCurrentBubbles() _uiState.value = _uiState.value.copy(userTranscript = "") + persistMessages() } - geminiService.onInputTranscription = { text -> + geminiService.onInputTranscription = input@{ text -> + // NON_BLOCKING: keep accepting input during tool execution + + val newTranscript = _uiState.value.userTranscript + text + lastUserOriginalInstruction = newTranscript + _uiState.value = _uiState.value.copy( - userTranscript = _uiState.value.userTranscript + text, + userTranscript = newTranscript, aiTranscript = "" ) + updateUserBubble(newTranscript) } - geminiService.onOutputTranscription = { text -> - _uiState.value = _uiState.value.copy( - aiTranscript = _uiState.value.aiTranscript + text - ) + geminiService.onOutputTranscription = output@{ text -> + val newAI = _uiState.value.aiTranscript + text + _uiState.value = _uiState.value.copy(aiTranscript = newAI) + updateAIBubble(newAI) } geminiService.onDisconnected = { reason -> - if (_uiState.value.isGeminiActive) { - stopSession() + if (_uiState.value.isGeminiActive && !userStopped) { _uiState.value = _uiState.value.copy( - errorMessage = "Connection lost: ${reason ?: "Unknown error"}" + errorMessage = "Disconnected: ${reason ?: "Unknown"}\nReconnecting..." 
) + scheduleReconnect(reason) } } + progressSpeechService.connect() - // Check OpenClaw and start session viewModelScope.launch { openClawBridge.checkConnection() openClawBridge.resetSession() + openClawBridge.eventClient = eventClient - // Wire tool call handling - toolCallRouter = ToolCallRouter(openClawBridge, viewModelScope) + // Connect event client early — needed for image sending via chat.send + syncProactiveNotifications() + + toolCallRouter = ToolCallRouter( + bridge = openClawBridge, + scope = viewModelScope, + latestFrameProvider = { latestFrameForToolCall }, + originalInstructionProvider = { lastUserOriginalInstruction } + ) + + // Local capture_photo handler + toolCallRouter?.onCapturePhoto = { description, completion -> + val frame = latestFrameForToolCall + if (frame != null) { + val photo = PhotoCaptureStore.saveFrame(getApplication(), frame, description) + if (photo != null) { + _captureEvent.value = photo + // Also upload to Mac so agent can access the file + viewModelScope.launch(kotlinx.coroutines.Dispatchers.IO) { + val macPath = try { + val baos = java.io.ByteArrayOutputStream() + frame.compress(android.graphics.Bitmap.CompressFormat.JPEG, 90, baos) + val base64 = android.util.Base64.encodeToString(baos.toByteArray(), android.util.Base64.NO_WRAP) + openClawBridge.uploadImageFilePublic(base64) + } catch (e: Exception) { null } + if (macPath != null) { + completion(ToolResult.Success("Photo captured and saved: ${photo.filename}\nAlso saved on Mac at: $macPath")) + } else { + completion(ToolResult.Success("Photo captured and saved: ${photo.filename}")) + } + } + } else { + completion(ToolResult.Failure("Failed to save photo")) + } + } else { + completion(ToolResult.Failure("No camera frame available to capture")) + } + } + + // Auto-save to gallery when image is attached to execute call + toolCallRouter?.onAutoSaveFrame = { bitmap, description -> + PhotoCaptureStore.saveFrame(getApplication(), bitmap, description) + _captureEvent.value = 
PhotoCaptureStore.photos.value.firstOrNull() + } + + // Load gallery + PhotoCaptureStore.loadPhotos(getApplication()) geminiService.onToolCall = { toolCall -> for (call in toolCall.functionCalls) { + val taskDesc = (call.args["task"] as? String) ?: "" + RemoteLogger.log("voice:tool_call", mapOf("tool" to call.name, "task" to taskDesc)) + if (call.name == "execute") { + lastSpokenProgressKind = null + lastSpokenProgressAtMs = 0 + eventClient.resetProgressState() + } + + finalizeCurrentBubbles() + val toolMsg = ChatMessage( + role = ChatMessageRole.ToolCall(call.name), + text = "Executing...", + status = ChatMessageStatus.Streaming, + ) + val msgs = _uiState.value.messages.toMutableList() + msgs.add(toolMsg) + _uiState.value = _uiState.value.copy(messages = msgs) + toolCallRouter?.handleToolCall(call) { response -> + RemoteLogger.log("voice:tool_result", mapOf("tool" to call.name, "result" to response.toString().take(500))) + val updated = _uiState.value.messages.map { + if (it.id == toolMsg.id) it.copy(text = "Done", status = ChatMessageStatus.Complete) else it + } + _uiState.value = _uiState.value.copy(messages = updated) + // Reset active bubbles so post-tool AI text goes into a new bubble + finalizeCurrentBubbles() geminiService.sendToolResponse(response) } } @@ -121,25 +338,27 @@ class GeminiSessionViewModel : ViewModel() { toolCallRouter?.cancelToolCalls(cancellation.ids) } - // Observe service state stateObservationJob = viewModelScope.launch { while (isActive) { delay(100) + + val latestToolStatus = openClawBridge.lastToolCallStatus.value + syncMicWithToolExecution(latestToolStatus) + _uiState.value = _uiState.value.copy( connectionState = geminiService.connectionState.value, isModelSpeaking = geminiService.isModelSpeaking.value, - toolCallStatus = openClawBridge.lastToolCallStatus.value, + toolCallStatus = latestToolStatus, openClawConnectionState = openClawBridge.connectionState.value, ) } } - // Connect to Gemini geminiService.connect { setupOk -> if 
(!setupOk) { val msg = when (val state = geminiService.connectionState.value) { is GeminiConnectionState.Error -> state.message - else -> "Failed to connect to Gemini" + else -> geminiService.lastDisconnectInfo.value ?: "Failed to connect to Gemini" } _uiState.value = _uiState.value.copy(errorMessage = msg) geminiService.disconnect() @@ -151,9 +370,10 @@ class GeminiSessionViewModel : ViewModel() { return@connect } - // Start mic capture try { audioManager.startCapture() + audioManager.setMicEnabled(_uiState.value.isMicEnabled) + _uiState.value = _uiState.value.copy(errorMessage = null) } catch (e: Exception) { _uiState.value = _uiState.value.copy( errorMessage = "Mic capture failed: ${e.message}" @@ -165,46 +385,292 @@ class GeminiSessionViewModel : ViewModel() { connectionState = GeminiConnectionState.Disconnected ) } + } + } + } - // Connect to OpenClaw event stream for proactive notifications - if (SettingsManager.proactiveNotificationsEnabled) { - eventClient.onNotification = { text -> - val state = _uiState.value - if (state.isGeminiActive && state.connectionState == GeminiConnectionState.Ready) { - geminiService.sendTextMessage(text) - } + private fun scheduleReconnect(reason: String?) { + if (reconnectJob?.isActive == true) return + if (userStopped) return + + reconnectJob = viewModelScope.launch { + toolCallRouter?.cancelAll() + openClawBridge.cancelInFlight("gemini disconnected: ${reason ?: "unknown"}") + + audioManager.stopCapture() + geminiService.disconnect() + + reconnectAttempts = 0 + + while (isActive && !userStopped && reconnectAttempts < maxReconnectAttempts) { + val backoffSec = listOf(1L, 2L, 4L, 8L, 16L, 30L).getOrElse(reconnectAttempts) { 30L } + reconnectAttempts++ + + _uiState.value = _uiState.value.copy( + errorMessage = "Reconnecting... 
(attempt $reconnectAttempts/$maxReconnectAttempts, wait ${backoffSec}s)\nLast: ${reason ?: "Unknown"}" + ) + + delay(backoffSec * 1000) + + var cbOk = false + geminiService.connect { ok -> cbOk = ok } + + val startWait = System.currentTimeMillis() + var ready = false + var errored = false + + while (isActive && !userStopped && System.currentTimeMillis() - startWait < 20_000) { + when (geminiService.connectionState.value) { + is GeminiConnectionState.Ready -> { ready = true; break } + is GeminiConnectionState.Error -> { errored = true; break } + else -> delay(100) + } + } + + if ((cbOk || ready) && geminiService.connectionState.value == GeminiConnectionState.Ready) { + try { + audioManager.startCapture() + audioManager.setMicEnabled(_uiState.value.isMicEnabled) + _uiState.value = _uiState.value.copy(errorMessage = null) + reconnectAttempts = 0 + return@launch + } catch (e: Exception) { + _uiState.value = _uiState.value.copy( + errorMessage = "Reconnected but mic capture failed: ${e.message}" + ) + } + } else { + val last = (geminiService.connectionState.value as? 
GeminiConnectionState.Error)?.message + ?: geminiService.lastDisconnectInfo.value + ?: "unknown" + _uiState.value = _uiState.value.copy( + errorMessage = "Reconnect failed (attempt $reconnectAttempts): $last" + ) + + if (errored) { + geminiService.disconnect() + audioManager.stopCapture() } - eventClient.connect() } } + + _uiState.value = _uiState.value.copy( + errorMessage = "Reconnect failed after $maxReconnectAttempts attempts.\nLast: ${reason ?: "Unknown"}" + ) } } fun stopSession() { + RemoteLogger.log("session:end") + StreamingService.stop(getApplication()) + userStopped = true + reconnectJob?.cancel() + reconnectJob = null + eventClient.disconnect() toolCallRouter?.cancelAll() toolCallRouter = null + + openClawBridge.cancelInFlight("user stopSession") + audioManager.stopCapture() + progressSpeechService.disconnect() geminiService.disconnect() + stateObservationJob?.cancel() stateObservationJob = null - _uiState.value = GeminiUiState() + + netMonitorJob?.cancel() + netMonitorJob = null + netMonitor.stop() + + // Keep message history, just reset session state + _uiState.value = GeminiUiState(messages = _uiState.value.messages) + persistMessages() + lastUserOriginalInstruction = null + latestFrameForToolCall = null + micStateBeforeExecution = null + micAutoMutedForExecution = false + } + + private fun syncProactiveNotifications() { + // Always connect event client — needed for image sending via chat.send + eventClient.onProgress = { progress -> + openClawBridge.setToolCallProgress(progress.displayText) + maybeSpeakProgress(progress) + } + if (SettingsManager.proactiveNotificationsEnabled) { + eventClient.onNotification = { text -> + val state = _uiState.value + if (state.isGeminiActive && state.connectionState == GeminiConnectionState.Ready) { + geminiService.sendTextMessage(text) + } + } + } else { + eventClient.onNotification = null + } + eventClient.connect() + } + + private fun maybeSpeakProgress(progress: OpenClawProgress) { + val status = 
openClawBridge.lastToolCallStatus.value + if (status !is ToolCallStatus.Executing) { + Log.d("GeminiProgress", "Skip speech: not executing kind=${progress.kind} tool=${progress.toolName}") + return + } + + val now = System.currentTimeMillis() + if (lastSpokenProgressAtMs != 0L && now - lastSpokenProgressAtMs < 8_000) { + Log.d("GeminiProgress", "Skip speech: throttle kind=${progress.kind} tool=${progress.toolName}") + return + } + + lastSpokenProgressKind = progress.kind + lastSpokenProgressAtMs = now + val languageName = progressLanguageName() + Log.d("GeminiProgress", "Send speech hint: ${progress.speechHint} language=$languageName kind=${progress.kind} tool=${progress.toolName}") + progressSpeechService.speakProgress(progress.speechHint, languageName) + } + + private fun progressLanguageName(): String { + return if (containsJapanese(lastUserOriginalInstruction.orEmpty())) "Japanese" else "English" + } + + private fun containsJapanese(text: String): Boolean { + return text.any { ch -> + (ch in '\u3040'..'\u30ff') || (ch in '\u3400'..'\u9fff') + } } fun sendVideoFrameIfThrottled(bitmap: Bitmap) { if (!SettingsManager.videoStreamingEnabled) return if (!_uiState.value.isGeminiActive) return if (_uiState.value.connectionState != GeminiConnectionState.Ready) return + + val intervalMs = when (_uiState.value.networkType) { + NetworkType.WIFI -> videoIntervalWifiMs + NetworkType.CELLULAR -> videoIntervalCellularMs + NetworkType.OTHER -> videoIntervalOtherMs + NetworkType.NONE -> return + } + val now = System.currentTimeMillis() - if (now - lastVideoFrameTime < GeminiConfig.VIDEO_FRAME_INTERVAL_MS) return + if (now - lastVideoFrameTime < intervalMs) return lastVideoFrameTime = now + + // ✅ tool-call 시점에 업로드할 "원본 bitmap"을 그대로 보관 + latestFrameForToolCall = bitmap + + // Gemini 입력은 기존 로직대로 (GeminiLiveService 내부에서 resize/base64 처리) geminiService.sendVideoFrame(bitmap) } + fun clearCachedVideoFrame() { + latestFrameForToolCall = null + lastVideoFrameTime = 0 + } + + suspend 
fun runOpenClawDeveloperCommand(command: String): String { + val result = openClawBridge.sendSessionCommand(command) + return when (result) { + is ToolResult.Success -> { + if (command.trim() == "/new") { + finalizeCurrentBubbles() + val msgs = _uiState.value.messages.toMutableList() + if (msgs.isNotEmpty()) { + msgs.add(ChatMessage(role = ChatMessageRole.SessionDivider, text = "")) + } + _uiState.value = _uiState.value.copy( + userTranscript = "", + aiTranscript = "", + messages = msgs, + ) + persistMessages() + lastUserOriginalInstruction = null + lastSpokenProgressKind = null + lastSpokenProgressAtMs = 0 + } + result.result.ifBlank { "OpenClaw command completed." } + } + is ToolResult.Failure -> { + val message = result.error + _uiState.value = _uiState.value.copy(errorMessage = message) + message + } + } + } + fun clearError() { _uiState.value = _uiState.value.copy(errorMessage = null) } + private fun persistMessages() { + ChatHistoryStore.save(getApplication(), _uiState.value.messages) + } + + // Chat message helpers + + private fun updateUserBubble(text: String) { + if (text.isEmpty()) return + val msgs = _uiState.value.messages.toMutableList() + val existingIdx = activeUserBubbleId?.let { id -> msgs.indexOfFirst { it.id == id } }?.takeIf { it >= 0 } + + if (existingIdx != null) { + msgs[existingIdx] = msgs[existingIdx].copy(text = text) + } else { + // Finalize previous AI bubble + activeAIBubbleId?.let { aiId -> + val aiIdx = msgs.indexOfFirst { it.id == aiId } + if (aiIdx >= 0) msgs[aiIdx] = msgs[aiIdx].copy(status = ChatMessageStatus.Complete) + activeAIBubbleId = null + } + val msg = ChatMessage(role = ChatMessageRole.User, text = text, status = ChatMessageStatus.Streaming) + msgs.add(msg) + activeUserBubbleId = msg.id + } + lastUserText = text + _uiState.value = _uiState.value.copy(messages = msgs) + } + + private fun updateAIBubble(text: String) { + if (text.isEmpty()) return + val msgs = _uiState.value.messages.toMutableList() + + // Finalize user 
bubble + activeUserBubbleId?.let { userId -> + val idx = msgs.indexOfFirst { it.id == userId } + if (idx >= 0) msgs[idx] = msgs[idx].copy(status = ChatMessageStatus.Complete) + } + + val existingIdx = activeAIBubbleId?.let { id -> msgs.indexOfFirst { it.id == id } }?.takeIf { it >= 0 } + if (existingIdx != null) { + msgs[existingIdx] = msgs[existingIdx].copy(text = text) + } else { + val msg = ChatMessage(role = ChatMessageRole.Assistant, text = text, status = ChatMessageStatus.Streaming) + msgs.add(msg) + activeAIBubbleId = msg.id + } + lastAIText = text + _uiState.value = _uiState.value.copy(messages = msgs) + } + + private fun finalizeCurrentBubbles() { + val msgs = _uiState.value.messages.toMutableList() + activeUserBubbleId?.let { id -> + val idx = msgs.indexOfFirst { it.id == id } + if (idx >= 0) msgs[idx] = msgs[idx].copy(status = ChatMessageStatus.Complete) + } + activeAIBubbleId?.let { id -> + val idx = msgs.indexOfFirst { it.id == id } + if (idx >= 0) msgs[idx] = msgs[idx].copy(status = ChatMessageStatus.Complete) + } + activeUserBubbleId = null + activeAIBubbleId = null + lastUserText = "" + lastAIText = "" + _uiState.value = _uiState.value.copy(messages = msgs) + } + override fun onCleared() { super.onCleared() stopSession() diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/RemoteLogger.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/RemoteLogger.kt new file mode 100644 index 00000000..75542a3e --- /dev/null +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/gemini/RemoteLogger.kt @@ -0,0 +1,77 @@ +package com.meta.wearable.dat.externalsampleapps.cameraaccess.gemini + +import android.util.Log +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.GlobalScope +import kotlinx.coroutines.launch +import okhttp3.MediaType.Companion.toMediaType +import 
okhttp3.OkHttpClient +import okhttp3.Request +import okhttp3.RequestBody.Companion.toRequestBody +import org.json.JSONObject +import java.util.concurrent.TimeUnit + +/// Sends conversation events to the logging server for persistent logging. +/// All methods are fire-and-forget -- logging never blocks the UI or conversation flow. +object RemoteLogger { + private const val TAG = "RemoteLogger" + private val JSON_MEDIA = "application/json".toMediaType() + + private val client = OkHttpClient.Builder() + .connectTimeout(5, TimeUnit.SECONDS) + .readTimeout(5, TimeUnit.SECONDS) + .writeTimeout(5, TimeUnit.SECONDS) + .build() + + private var sequenceNumber = 0 + + private val baseURL: String? + get() { + return if (GeminiConfig.isOpenClawConfigured) { + "${GeminiConfig.openClawHost}:8080" + } else { + null + } + } + + /// Log a conversation event. Types: + /// - "voice:user" -- user speech transcript from Gemini + /// - "voice:ai" -- Gemini voice response transcript + /// - "voice:tool_call" -- Gemini triggered execute tool + /// - "voice:tool_result" -- tool result sent back to Gemini + /// - "session:start" -- voice session started + /// - "session:end" -- voice session ended + fun log(type: String, data: Map = emptyMap()) { + val url = baseURL ?: return + val loggingUrl = "$url/api/logs" + + sequenceNumber++ + val eventData = JSONObject().apply { + put("event", type) + put("seq", sequenceNumber) + data.forEach { (k, v) -> put(k, v) } + } + + val payload = JSONObject().apply { + put("type", "event") + put("session", "android-client") + put("data", eventData) + } + + // Fire and forget + GlobalScope.launch(Dispatchers.IO) { + try { + val request = Request.Builder() + .url(loggingUrl) + .post(payload.toString().toRequestBody(JSON_MEDIA)) + .addHeader("Content-Type", "application/json") + .addHeader("x-api-token", GeminiConfig.openClawGatewayToken) + .build() + + client.newCall(request).execute().use { /* close */ } + } catch (e: Exception) { + Log.d(TAG, "Failed to log 
event: ${e.message}") + } + } + } +} diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/net/NetworkTypeMonitor.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/net/NetworkTypeMonitor.kt new file mode 100644 index 00000000..097b6885 --- /dev/null +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/net/NetworkTypeMonitor.kt @@ -0,0 +1,78 @@ +// app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/net/NetworkTypeMonitor.kt +package com.meta.wearable.dat.externalsampleapps.cameraaccess.net + +import android.content.Context +import android.net.ConnectivityManager +import android.net.Network +import android.net.NetworkCapabilities +import android.util.Log +import kotlinx.coroutines.flow.MutableStateFlow +import kotlinx.coroutines.flow.StateFlow +import kotlinx.coroutines.flow.asStateFlow + +enum class NetworkType { + WIFI, + CELLULAR, + OTHER, + NONE +} + +class NetworkTypeMonitor(context: Context) { + companion object { + private const val TAG = "NetworkTypeMonitor" + } + + private val cm = context.applicationContext.getSystemService(Context.CONNECTIVITY_SERVICE) as ConnectivityManager + + private val _networkType = MutableStateFlow(NetworkType.NONE) + val networkType: StateFlow = _networkType.asStateFlow() + + private val callback = object : ConnectivityManager.NetworkCallback() { + override fun onAvailable(network: Network) { + update() + } + + override fun onLost(network: Network) { + update() + } + + override fun onCapabilitiesChanged(network: Network, networkCapabilities: NetworkCapabilities) { + update() + } + } + + fun start() { + try { + cm.registerDefaultNetworkCallback(callback) + } catch (e: Exception) { + Log.e(TAG, "registerDefaultNetworkCallback failed: ${e.message}") + } + update() + } + + fun stop() { + try { + cm.unregisterNetworkCallback(callback) + } catch (_: 
Exception) { + } + } + + private fun update() { + val active = cm.activeNetwork + if (active == null) { + _networkType.value = NetworkType.NONE + return + } + val caps = cm.getNetworkCapabilities(active) + if (caps == null) { + _networkType.value = NetworkType.OTHER + return + } + + _networkType.value = when { + caps.hasTransport(NetworkCapabilities.TRANSPORT_WIFI) -> NetworkType.WIFI + caps.hasTransport(NetworkCapabilities.TRANSPORT_CELLULAR) -> NetworkType.CELLULAR + else -> NetworkType.OTHER + } + } +} \ No newline at end of file diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/OpenClawBridge.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/OpenClawBridge.kt index 4310ca8c..b540bbd4 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/OpenClawBridge.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/OpenClawBridge.kt @@ -1,17 +1,24 @@ +// app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/OpenClawBridge.kt package com.meta.wearable.dat.externalsampleapps.cameraaccess.openclaw import android.util.Log import com.meta.wearable.dat.externalsampleapps.cameraaccess.gemini.GeminiConfig import java.util.concurrent.TimeUnit +import java.util.concurrent.atomic.AtomicReference +import kotlinx.coroutines.CancellableContinuation import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.flow.MutableStateFlow +import kotlinx.coroutines.suspendCancellableCoroutine import kotlinx.coroutines.flow.StateFlow import kotlinx.coroutines.flow.asStateFlow import kotlinx.coroutines.withContext +import okhttp3.Call import okhttp3.MediaType.Companion.toMediaType +import okhttp3.MultipartBody import okhttp3.OkHttpClient import okhttp3.Request import 
okhttp3.RequestBody.Companion.toRequestBody +import okhttp3.MediaType.Companion.toMediaTypeOrNull import org.json.JSONArray import org.json.JSONObject @@ -19,21 +26,39 @@ class OpenClawBridge { companion object { private const val TAG = "OpenClawBridge" private const val MAX_HISTORY_TURNS = 10 + + // OpenClaw media endpoints (split read/write) + private const val MEDIA_READ_PORT = 18080 + private const val MEDIA_UPLOAD_PORT = 18081 + private const val MEDIA_UPLOAD_PATH = "/upload" // <-- 필요하면 여기만 수정 } private val _lastToolCallStatus = MutableStateFlow(ToolCallStatus.Idle) val lastToolCallStatus: StateFlow = _lastToolCallStatus.asStateFlow() - private val _connectionState = MutableStateFlow(OpenClawConnectionState.NotConfigured) + private val _connectionState = + MutableStateFlow(OpenClawConnectionState.NotConfigured) val connectionState: StateFlow = _connectionState.asStateFlow() + /** Set by GeminiSessionViewModel so we can send tasks via WebSocket chat.send */ + var eventClient: OpenClawEventClient? 
= null + fun setToolCallStatus(status: ToolCallStatus) { _lastToolCallStatus.value = status } + fun setToolCallProgress(progressText: String) { + val current = _lastToolCallStatus.value + if (current is ToolCallStatus.Executing) { + _lastToolCallStatus.value = current.copy(progressText = progressText) + } + } + private val client = OkHttpClient.Builder() - .readTimeout(120, TimeUnit.SECONDS) - .connectTimeout(10, TimeUnit.SECONDS) + .connectTimeout(15, TimeUnit.SECONDS) + .readTimeout(300, TimeUnit.SECONDS) + .callTimeout(330, TimeUnit.SECONDS) + .writeTimeout(30, TimeUnit.SECONDS) .build() private val pingClient = OkHttpClient.Builder() @@ -41,6 +66,16 @@ class OpenClawBridge { .connectTimeout(5, TimeUnit.SECONDS) .build() + private val inFlightCallRef = AtomicReference(null) + + fun cancelInFlight(reason: String = "cancelled") { + val call = inFlightCallRef.getAndSet(null) + if (call != null && !call.isCanceled()) { + Log.w(TAG, "Cancelling in-flight OpenClaw call: $reason") + call.cancel() + } + } + private var sessionKey: String = "agent:main:glass" private val conversationHistory = mutableListOf() @@ -50,7 +85,7 @@ class OpenClawBridge { return@withContext } _connectionState.value = OpenClawConnectionState.Checking - + Log.d("OpenClawBridge", "hookToken(prefix)=${GeminiConfig.openClawHookToken.take(6)}...${GeminiConfig.openClawHookToken.takeLast(4)}") val url = "${GeminiConfig.openClawHost}:${GeminiConfig.openClawPort}/v1/chat/completions" try { val request = Request.Builder() @@ -58,6 +93,7 @@ class OpenClawBridge { .get() .addHeader("Authorization", "Bearer ${GeminiConfig.openClawGatewayToken}") .addHeader("x-openclaw-message-channel", "glass") + .addHeader("x-openclaw-scopes", "operator.write") .build() val response = pingClient.newCall(request).execute() @@ -72,7 +108,7 @@ class OpenClawBridge { } } catch (e: Exception) { _connectionState.value = OpenClawConnectionState.Unreachable(e.message ?: "Unknown error") - Log.d(TAG, "Gateway unreachable: 
${e.message}") + Log.d(TAG, "Gateway unreachable: ${e::class.java.name}: ${e.message}") } } @@ -81,51 +117,211 @@ class OpenClawBridge { Log.d(TAG, "Session reset (key retained: $sessionKey)") } + suspend fun sendSessionCommand(command: String): ToolResult = withContext(Dispatchers.IO) { + val normalized = command.trim() + if (normalized != "/new" && normalized != "/compact") { + return@withContext ToolResult.Failure("Unsupported OpenClaw command: $normalized") + } + + _lastToolCallStatus.value = ToolCallStatus.Executing("OpenClaw", "Sending $normalized") + + val ec = eventClient + if (ec != null) { + val wsResult = sendViaWebSocket(ec, normalized, imageBase64 = null, toolName = "OpenClaw") + if (wsResult is ToolResult.Success) { + if (normalized == "/new") resetSession() + return@withContext wsResult + } + _lastToolCallStatus.value = ToolCallStatus.Executing("OpenClaw", "Sending $normalized") + } + + if (!GeminiConfig.isOpenClawConfigured) { + _lastToolCallStatus.value = ToolCallStatus.Failed("OpenClaw", "Not configured") + return@withContext ToolResult.Failure("OpenClaw is not configured") + } + + val url = "${GeminiConfig.openClawHost}:${GeminiConfig.openClawPort}/v1/chat/completions" + val messagesArray = JSONArray().put(JSONObject().apply { + put("role", "user") + put("content", normalized) + }) + val body = JSONObject().apply { + put("model", "openclaw") + put("messages", messagesArray) + put("stream", false) + } + val request = Request.Builder() + .url(url) + .post(body.toString().toRequestBody("application/json".toMediaType())) + .addHeader("Authorization", "Bearer ${GeminiConfig.openClawGatewayToken}") + .addHeader("Content-Type", "application/json") + .addHeader("x-openclaw-session-key", sessionKey) + .addHeader("x-openclaw-message-channel", "glass") + .addHeader("x-openclaw-scopes", "operator.write") + .build() + + val call = client.newCall(request) + inFlightCallRef.set(call) + try { + val response = call.execute() + val responseBody = 
response.body?.string() ?: "" + val statusCode = response.code + response.close() + + if (statusCode !in 200..299) { + _lastToolCallStatus.value = ToolCallStatus.Failed("OpenClaw", "HTTP $statusCode") + return@withContext ToolResult.Failure("OpenClaw command returned HTTP $statusCode") + } + + if (normalized == "/new") resetSession() + + val content = try { + JSONObject(responseBody).optJSONArray("choices") + ?.optJSONObject(0) + ?.optJSONObject("message") + ?.optString("content", "") + ?.takeIf { it.isNotBlank() } + } catch (_: Exception) { + responseBody.takeIf { it.isNotBlank() } + } + + _lastToolCallStatus.value = ToolCallStatus.Completed("OpenClaw") + ToolResult.Success(content ?: "OpenClaw command completed.") + } catch (e: Exception) { + Log.e(TAG, "OpenClaw command error: ${e::class.java.name}: ${e.message}") + _lastToolCallStatus.value = ToolCallStatus.Failed("OpenClaw", e.message ?: "Unknown") + ToolResult.Failure("OpenClaw command failed: ${e.message}") + } finally { + inFlightCallRef.compareAndSet(call, null) + } + } + + /** + * Upload JPEG bytes to OpenClaw media upload API (write-only port 18081). + * Returns a read-only URL on port 18080. + */ + suspend fun uploadToolCallImage(jpegBytes: ByteArray): String? 
= withContext(Dispatchers.IO) { + if (!GeminiConfig.isOpenClawConfigured) return@withContext null + + val host = GeminiConfig.openClawHost.trimEnd('/') + val uploadUrl = "${host}:${MEDIA_UPLOAD_PORT}${MEDIA_UPLOAD_PATH}" + + val filename = "tool_${System.currentTimeMillis()}.jpg" + + val body = MultipartBody.Builder() + .setType(MultipartBody.FORM) + .addFormDataPart( + name = "file", // <-- 서버 스펙이 "image"면 여기만 바꾸면 됨 + filename = filename, + body = jpegBytes.toRequestBody("image/jpeg".toMediaTypeOrNull()) + ) + .build() + + val request = Request.Builder() + .url(uploadUrl) + .post(body) + .addHeader("Authorization", "Bearer ${GeminiConfig.openClawHookToken}") + .build() + + try { + Log.d("OpenClawBridge", "Uploading to $uploadUrl bytes=${jpegBytes.size}") + val response = client.newCall(request).execute() + val respBody = response.body?.string() ?: "" + val code = response.code + response.close() + Log.w("OpenClawBridge", "Upload HTTP $code body=${respBody.take(300)}") + if (code !in 200..299) { + Log.w(TAG, "Media upload failed: HTTP $code - ${respBody.take(200)}") + return@withContext null + } + + // tolerant parse: JSON {url/readUrl/filename/file/path} or plain string + val inferred: String? 
= try { + val j = JSONObject(respBody) + j.optString("url", null) + ?: j.optString("readUrl", null) + ?: j.optString("filename", null) + ?: j.optString("file", null) + ?: j.optString("path", null) + } catch (_: Exception) { + respBody.trim().ifEmpty { null } + } + + if (inferred.isNullOrEmpty()) return@withContext null + + if (inferred.startsWith("http://") || inferred.startsWith("https://")) { + return@withContext inferred + } + + val cleaned = inferred.trimStart('/') + val readUrl = "${host}:${MEDIA_READ_PORT}/${cleaned}" + return@withContext readUrl + } catch (e: Exception) { + Log.w(TAG, "Media upload exception: ${e::class.java.simpleName}: ${e.message}") + return@withContext null + } + } + suspend fun delegateTask( task: String, - toolName: String = "execute" + toolName: String = "execute", + imageBase64: String? = null ): ToolResult = withContext(Dispatchers.IO) { - _lastToolCallStatus.value = ToolCallStatus.Executing(toolName) + _lastToolCallStatus.value = ToolCallStatus.Executing(toolName, "OpenClaw is working") + + val ec = eventClient + if (ec != null) { + val imageSize = imageBase64?.let { "${it.length / 1024} KB" } ?: "none" + Log.d(TAG, "Sending task via WebSocket chat.send (image=$imageSize)") + val wsResult = sendViaWebSocket(ec, task, imageBase64, toolName) + if (wsResult is ToolResult.Success || imageBase64 != null) { + return@withContext wsResult + } + Log.w(TAG, "WebSocket chat.send failed for text task; falling back to HTTP") + _lastToolCallStatus.value = ToolCallStatus.Executing(toolName, "OpenClaw is working") + } else if (imageBase64 != null) { + Log.w(TAG, "Image task but no event client, falling back to text-only HTTP") + } val url = "${GeminiConfig.openClawHost}:${GeminiConfig.openClawPort}/v1/chat/completions" - // Append user message - conversationHistory.add(JSONObject().apply { + val userMessage = JSONObject().apply { put("role", "user") put("content", task) - }) + } + + conversationHistory.add(userMessage) - // Trim history if 
(conversationHistory.size > MAX_HISTORY_TURNS * 2) { val trimmed = conversationHistory.takeLast(MAX_HISTORY_TURNS * 2) conversationHistory.clear() conversationHistory.addAll(trimmed) } - Log.d(TAG, "Sending ${conversationHistory.size} messages in conversation") + val messagesArray = JSONArray() + for (msg in conversationHistory) messagesArray.put(msg) - try { - val messagesArray = JSONArray() - for (msg in conversationHistory) { - messagesArray.put(msg) - } + val body = JSONObject().apply { + put("model", "openclaw") + put("messages", messagesArray) + put("stream", false) + } - val body = JSONObject().apply { - put("model", "openclaw") - put("messages", messagesArray) - put("stream", false) - } + val request = Request.Builder() + .url(url) + .post(body.toString().toRequestBody("application/json".toMediaType())) + .addHeader("Authorization", "Bearer ${GeminiConfig.openClawGatewayToken}") + .addHeader("Content-Type", "application/json") + .addHeader("x-openclaw-session-key", sessionKey) + .addHeader("x-openclaw-message-channel", "glass") + .addHeader("x-openclaw-scopes", "operator.write") + .build() - val request = Request.Builder() - .url(url) - .post(body.toString().toRequestBody("application/json".toMediaType())) - .addHeader("Authorization", "Bearer ${GeminiConfig.openClawGatewayToken}") - .addHeader("Content-Type", "application/json") - .addHeader("x-openclaw-session-key", sessionKey) - .addHeader("x-openclaw-message-channel", "glass") - .build() + val call = client.newCall(request) + inFlightCallRef.set(call) - val response = client.newCall(request).execute() + try { + val response = call.execute() val responseBody = response.body?.string() ?: "" val statusCode = response.code response.close() @@ -137,8 +333,8 @@ class OpenClawBridge { } val json = JSONObject(responseBody) - val choices = json.optJSONArray("choices") - val content = choices?.optJSONObject(0) + val content = json.optJSONArray("choices") + ?.optJSONObject(0) ?.optJSONObject("message") 
?.optString("content", "") @@ -147,7 +343,6 @@ class OpenClawBridge { put("role", "assistant") put("content", content) }) - Log.d(TAG, "Agent result: ${content.take(200)}") _lastToolCallStatus.value = ToolCallStatus.Completed(toolName) return@withContext ToolResult.Success(content) } @@ -156,14 +351,95 @@ class OpenClawBridge { put("role", "assistant") put("content", responseBody) }) - Log.d(TAG, "Agent raw: ${responseBody.take(200)}") _lastToolCallStatus.value = ToolCallStatus.Completed(toolName) return@withContext ToolResult.Success(responseBody) } catch (e: Exception) { - Log.e(TAG, "Agent error: ${e.message}") + Log.e(TAG, "Agent error: ${e::class.java.name}: ${e.message}") _lastToolCallStatus.value = ToolCallStatus.Failed(toolName, e.message ?: "Unknown") return@withContext ToolResult.Failure("Agent error: ${e.message}") + } finally { + inFlightCallRef.compareAndSet(call, null) } } + /** + * Upload JPEG to the upload server so the agent can access the file on disk. + * Returns the saved file path, or null if upload fails. + */ + fun uploadImageFilePublic(imageBase64: String): String? = uploadImageFile(imageBase64) + + private fun uploadImageFile(imageBase64: String): String? 
{ + val uploadPort = GeminiConfig.openClawPort + 6 // upload server runs on gateway port + 6 + val host = GeminiConfig.openClawHost.trimEnd('/') + val url = "$host:$uploadPort/upload" + return try { + val jpegBytes = android.util.Base64.decode(imageBase64, android.util.Base64.NO_WRAP) + val request = Request.Builder() + .url(url) + .post(jpegBytes.toRequestBody("image/jpeg".toMediaType())) + .build() + val response = pingClient.newCall(request).execute() + val body = response.body?.string() ?: "" + response.close() + if (response.code in 200..299) { + val json = JSONObject(body) + val path = json.optString("path", "") + if (path.isNotEmpty()) { + Log.d(TAG, "Image uploaded to: $path") + path + } else null + } else { + Log.w(TAG, "Image upload HTTP ${response.code}") + null + } + } catch (e: Exception) { + Log.w(TAG, "Image upload failed: ${e.message}") + null + } + } + + /** + * Send a task via WebSocket chat.send RPC. + * Also uploads the image file to disk when present so the agent can access it. 
+ */ + private suspend fun sendViaWebSocket( + eventClient: OpenClawEventClient, + task: String, + imageBase64: String?, + toolName: String + ): ToolResult = suspendCancellableCoroutine { continuation -> + val taskWithPath = if (imageBase64 != null) { + val filePath = uploadImageFile(imageBase64) + if (filePath != null) { + "$task\n\n[image_file_path]\n$filePath" + } else { + task + } + } else { + task + } + + eventClient.sendChatMessage( + sessionKey = sessionKey, + message = taskWithPath, + imageBase64 = imageBase64 + ) { reply -> + if (reply != null) { + conversationHistory.add(JSONObject().apply { + put("role", "user") + put("content", task) + }) + conversationHistory.add(JSONObject().apply { + put("role", "assistant") + put("content", reply) + }) + Log.d(TAG, "WebSocket chat.send result: ${reply.take(200)}") + _lastToolCallStatus.value = ToolCallStatus.Completed(toolName) + continuation.resume(ToolResult.Success(reply)) {} + } else { + _lastToolCallStatus.value = ToolCallStatus.Failed(toolName, "WebSocket chat.send failed") + continuation.resume(ToolResult.Failure("Failed to send image via WebSocket")) {} + } + } + } } diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/OpenClawEventClient.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/OpenClawEventClient.kt index 0ff8981e..262497ef 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/OpenClawEventClient.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/OpenClawEventClient.kt @@ -4,6 +4,7 @@ import android.os.Handler import android.os.Looper import android.util.Log import com.meta.wearable.dat.externalsampleapps.cameraaccess.gemini.GeminiConfig +import java.net.URI import java.util.UUID import java.util.concurrent.TimeUnit import 
okhttp3.OkHttpClient @@ -14,6 +15,31 @@ import okhttp3.WebSocketListener import org.json.JSONArray import org.json.JSONObject +enum class OpenClawProgressKind(val displayText: String) { + Memory("Searching memory"), + Calendar("Checking calendar"), + SlackLookup("Checking Slack recipient"), + SlackSend("Sending Slack message"), + Slack("Checking Slack"), + Email("Checking email"), + Browser("Using browser"), + Web("Searching web"), + File("Reading files"), + Tool("Running tool") +} + +data class OpenClawProgress( + val kind: OpenClawProgressKind, + val toolName: String, + val phase: String, + val detail: String, + val speechHint: String, + val stableKey: String +) { + val displayText: String + get() = speechHint.replaceFirstChar { it.uppercase() } +} + class OpenClawEventClient { companion object { private const val TAG = "OpenClawEventClient" @@ -21,18 +47,32 @@ class OpenClawEventClient { } var onNotification: ((String) -> Unit)? = null + var onProgress: ((OpenClawProgress) -> Unit)? = null private var webSocket: WebSocket? = null private var isConnected = false private var shouldReconnect = false private var reconnectDelayMs = 2_000L private val handler = Handler(Looper.getMainLooper()) + private var lastProgressText: String? 
= null + private var lastProgressAtMs: Long = 0 + + // Pending RPC responses keyed by request ID + private val pendingResponses = mutableMapOf Unit>() + + // Pending chat.send results keyed by runId — waits for the "chat" event with state="final" + private val pendingChatResults = mutableMapOf Unit>() private val client = OkHttpClient.Builder() .readTimeout(0, TimeUnit.MILLISECONDS) .pingInterval(10, TimeUnit.SECONDS) .build() + fun resetProgressState() { + lastProgressText = null + lastProgressAtMs = 0 + } + fun connect() { if (!GeminiConfig.isOpenClawConfigured) { Log.d(TAG, "Not configured, skipping") @@ -47,6 +87,9 @@ class OpenClawEventClient { shouldReconnect = false isConnected = false handler.removeCallbacksAndMessages(null) + // Cancel all pending callbacks so they don't fire after session stops + pendingResponses.clear() + pendingChatResults.clear() webSocket?.close(1000, null) webSocket = null Log.d(TAG, "Disconnected") @@ -61,7 +104,10 @@ class OpenClawEventClient { Log.d(TAG, "Connecting to $url") - val request = Request.Builder().url(url).build() + val request = Request.Builder() + .url(url) + .header("Host", "localhost:${GeminiConfig.openClawPort}") + .build() webSocket = client.newWebSocket(request, object : WebSocketListener() { override fun onOpen(webSocket: WebSocket, response: Response) { Log.d(TAG, "WebSocket opened") @@ -93,15 +139,23 @@ class OpenClawEventClient { when (type) { "event" -> handleEvent(json) "res" -> { - val ok = json.optBoolean("ok", false) - if (ok) { - Log.d(TAG, "Connected and authenticated") - isConnected = true - reconnectDelayMs = 2_000L + val id = json.optString("id", "") + val callback = pendingResponses.remove(id) + if (callback != null) { + callback(json) } else { - val error = json.optJSONObject("error") - val msg = error?.optString("message", "unknown") ?: "unknown" - Log.e(TAG, "Connect failed: $msg") + // Connect handshake response + val ok = json.optBoolean("ok", false) + if (ok) { + Log.d(TAG, "Connected and 
authenticated") + isConnected = true + reconnectDelayMs = 2_000L + subscribeSessionEvents() + } else { + val error = json.optJSONObject("error") + val msg = error?.optString("message", "unknown") ?: "unknown" + Log.e(TAG, "Connect failed: $msg") + } } } } @@ -118,6 +172,288 @@ class OpenClawEventClient { "connect.challenge" -> sendConnectHandshake() "heartbeat" -> handleHeartbeatEvent(payload) "cron" -> handleCronEvent(payload) + "chat" -> handleChatEvent(payload) + "agent", "session.tool" -> handleAgentProgressEvent(event, payload) + } + } + + private fun handleAgentProgressEvent(event: String, payload: JSONObject) { + val data = payload.optJSONObject("data") ?: payload + val stream = payload.optString("stream", if (event == "session.tool") "tool" else "") + val phase = data.optString("phase", data.optString("status", "")) + + if (phase != "start" && phase != "update") { + Log.d(TAG, "Progress skip: phase=$phase event=$event") + return + } + + val name = data.optString("name", data.optString("title", "")) + val argsText = data.opt("args")?.toString() + ?: data.opt("arguments")?.toString() + ?: "" + if (stream == "item" && argsText.isBlank() && looksLikeToolItem(name)) { + Log.d(TAG, "Progress skip: waiting for richer tool event name=$name") + return + } + if (isNoisyInternalBashRead(name, argsText)) { + Log.d(TAG, "Progress skip: internal bash read name=$name args=${argsText.take(160)}") + return + } + val detail = buildString { + append(name) + append(" ") + append(data.optString("progressText", "")) + append(" ") + append(data.optString("partialResult", "")) + append(" ") + append(argsText) + } + + val progress = progressFor(name = name, detail = detail, argsText = argsText, stream = stream, phase = phase) + if (progress == null) { + Log.d(TAG, "Progress skip: unclassified event=$event stream=$stream phase=$phase name=$name detail=${detail.take(220)}") + return + } + emitProgress(progress) + } + + private fun looksLikeToolItem(name: String): Boolean { + val text 
= name.lowercase() + return text == "bash" || + text == "browser" || + text.contains("_") || + text.contains("memory") || + text.contains("search") || + text.contains("mail") || + text.contains("slack") || + text.contains("calendar") + } + + private fun progressFor( + name: String, + detail: String, + argsText: String, + stream: String, + phase: String + ): OpenClawProgress? { + val text = "$name $detail".lowercase() + + // These fire constantly and are too noisy for glasses. + if (text.contains("reasoning") + || text.contains("codex_app_server") + || stream == "lifecycle") { + return null + } + + val kind = when { + text.contains("memory") -> OpenClawProgressKind.Memory + text.contains("calendar") -> OpenClawProgressKind.Calendar + text.contains("slack") && (text.contains("lookup") || text.contains("user")) -> OpenClawProgressKind.SlackLookup + text.contains("slack") && (text.contains("send") || text.contains("message")) -> OpenClawProgressKind.SlackSend + text.contains("slack") -> OpenClawProgressKind.Slack + text.contains("mail") || text.contains("email") || text.contains("gmail") -> OpenClawProgressKind.Email + text.contains("browser") -> OpenClawProgressKind.Browser + text.contains("web") || text.contains("search") -> OpenClawProgressKind.Web + text.contains("file") || text.contains("read") -> OpenClawProgressKind.File + else -> OpenClawProgressKind.Tool + } + + // Raw bash can be very noisy, but many OpenClaw skills currently arrive + // as bash with the real service encoded in the command/cwd. 
+ if (name == "bash" && kind == OpenClawProgressKind.Tool) { + return null + } + + val target = progressTarget(kind, name, argsText, detail) + val speechHint = progressSpeechHint(kind, target) + val stableKey = listOf(kind.name.lowercase(), target.lowercase()) + .filter { it.isNotBlank() } + .joinToString(":") + + return OpenClawProgress( + kind = kind, + toolName = name.ifBlank { stream.ifBlank { "tool" } }, + phase = phase, + detail = detail.trim(), + speechHint = speechHint, + stableKey = stableKey + ) + } + + private fun progressSpeechHint(kind: OpenClawProgressKind, target: String): String { + return when (kind) { + OpenClawProgressKind.Memory -> listOf("searching memory", target).joinNonBlank(" for ") + OpenClawProgressKind.Calendar -> listOf("checking calendar", target).joinNonBlank(" for ") + OpenClawProgressKind.SlackLookup -> listOf("checking Slack recipient", target).joinNonBlank(" for ") + OpenClawProgressKind.SlackSend -> listOf("sending Slack message", target).joinNonBlank(" to ") + OpenClawProgressKind.Slack -> listOf("checking Slack", target).joinNonBlank(" for ") + OpenClawProgressKind.Email -> listOf("checking email", target).joinNonBlank(" for ") + OpenClawProgressKind.Browser -> if (target.isBlank()) "using browser" else "opening $target" + OpenClawProgressKind.Web -> listOf("searching web", target).joinNonBlank(" for ") + OpenClawProgressKind.File -> listOf("reading files", target).joinNonBlank(" for ") + OpenClawProgressKind.Tool -> listOf("running tool", target).joinNonBlank(" for ") + } + } + + private fun List.joinNonBlank(separator: String): String { + return filter { it.isNotBlank() }.joinToString(separator) + } + + private fun progressTarget( + kind: OpenClawProgressKind, + name: String, + argsText: String, + detail: String + ): String { + val args = parseJsonObject(argsText) + val query = args?.optString("query", "")?.takeIf { it.isNotBlank() } + val url = args?.optString("url", "")?.takeIf { it.isNotBlank() } + val path = 
args?.optString("path", "")?.takeIf { it.isNotBlank() } + val command = args?.optString("command", "")?.takeIf { it.isNotBlank() } + val action = args?.optString("action", "")?.takeIf { it.isNotBlank() } + + return when (kind) { + OpenClawProgressKind.Memory -> query ?: path?.substringAfterLast("/") ?: commandSearchTarget(command) ?: "" + OpenClawProgressKind.Calendar -> commandSearchTarget(command) ?: query ?: "" + OpenClawProgressKind.SlackLookup, + OpenClawProgressKind.SlackSend, + OpenClawProgressKind.Slack -> commandSearchTarget(command) ?: query ?: "" + OpenClawProgressKind.Email -> commandSearchTarget(command) ?: query ?: "" + OpenClawProgressKind.Browser -> domainFromUrl(url) ?: browserActionTarget(action, detail) + OpenClawProgressKind.Web -> domainFromUrl(url) ?: query ?: commandSearchTarget(command) ?: "" + OpenClawProgressKind.File -> path?.substringAfterLast("/") ?: commandSearchTarget(command) ?: "" + OpenClawProgressKind.Tool -> query ?: commandSearchTarget(command) ?: name + }.sanitizeTarget() + } + + private fun parseJsonObject(text: String): JSONObject? { + if (text.isBlank()) return null + return try { + JSONObject(text) + } catch (_: Exception) { + null + } + } + + private fun domainFromUrl(url: String?): String? { + if (url.isNullOrBlank()) return null + return try { + URI(url).host?.removePrefix("www.") + } catch (_: Exception) { + null + } + } + + private fun browserActionTarget(action: String?, detail: String): String { + if (!action.isNullOrBlank() && action != "act") return action + return when { + detail.contains("amazon", ignoreCase = true) -> "Amazon" + detail.contains("apple.com", ignoreCase = true) -> "apple.com" + else -> "" + } + } + + private fun commandSearchTarget(command: String?): String? 
{ + if (command.isNullOrBlank()) return null + val quoted = Regex("\"([^\"]{2,80})\"|'([^']{2,80})'").findAll(command) + .mapNotNull { it.groups[1]?.value ?: it.groups[2]?.value } + .firstOrNull { candidate -> isUsefulCommandCandidate(candidate) } + return quoted + } + + private fun isNoisyInternalBashRead(name: String, argsText: String): Boolean { + if (name != "bash") return false + val command = parseJsonObject(argsText) + ?.optString("command", "") + ?.takeIf { it.isNotBlank() } + ?: return false + + val lower = command.lowercase() + val looksReadOnly = listOf("sed ", "sed -n", "cat ", "head ", "tail ", "grep ", "rg ", "find ", "ls ", "for ") + .any { lower.contains(it) } + if (!looksReadOnly) return false + + return lower.contains("/skills/") || + lower.contains("skill.md") || + lower.contains("agents.md") || + lower.contains(".openclaw/workspace/memory") || + lower.contains("memory/2026") || + lower.contains("memory/2025") + } + + private fun isUsefulCommandCandidate(candidate: String): Boolean { + val text = candidate.trim() + if (text.isBlank()) return false + if (!text.any { it.isLetterOrDigit() || it.code > 127 }) return false + if (text.contains("/") || text.contains("--")) return false + if (text.startsWith("###")) return false + if (Regex("^\\d+,\\d+p$").matches(text)) return false + if (Regex("^\\d+,\\${'$'}p${'$'}").matches(text)) return false + if (text.startsWith("memory/", ignoreCase = true)) return false + if (text.endsWith(".md", ignoreCase = true)) return false + return true + } + + private fun String.sanitizeTarget(): String { + return trim() + .replace(Regex("\\s+"), " ") + .replace(Regex("[\\r\\n]"), " ") + .take(80) + } + + private fun emitProgress(progress: OpenClawProgress) { + val now = System.currentTimeMillis() + if (progress.stableKey == lastProgressText && now - lastProgressAtMs < 12_000) { + Log.d(TAG, "Progress skip: duplicate key=${progress.stableKey} tool=${progress.toolName}") + return + } + if (lastProgressText != null && now - 
lastProgressAtMs < 2_000) { + Log.d(TAG, "Progress skip: throttle display=${progress.displayText} tool=${progress.toolName}") + return + } + + lastProgressText = progress.stableKey + lastProgressAtMs = now + Log.d(TAG, "Progress: ${progress.speechHint} (${progress.toolName}) detail=${progress.detail.take(220)}") + handler.post { + onProgress?.invoke(progress) + } + } + + private fun handleChatEvent(payload: JSONObject) { + val state = payload.optString("state", "") + val runId = payload.optString("runId", "") + + if (state == "final" && runId.isNotEmpty()) { + val callback = pendingChatResults.remove(runId) + if (callback != null) { + // Extract reply text from message.content + val message = payload.optJSONObject("message") + val content = message?.opt("content") + val replyText = when { + content is String -> content + content is JSONArray -> { + val parts = mutableListOf() + for (i in 0 until content.length()) { + val part = content.optJSONObject(i) + if (part?.optString("type") == "text") { + parts.add(part.optString("text", "")) + } + } + parts.joinToString("\n").ifEmpty { null } + } + else -> null + } + Log.d(TAG, "chat final for $runId: ${replyText?.take(200)}") + callback(replyText ?: "Agent completed but returned no text.") + } + } else if (state == "error" && runId.isNotEmpty()) { + val callback = pendingChatResults.remove(runId) + if (callback != null) { + val errorMsg = payload.optString("errorMessage", "Agent error") + Log.e(TAG, "chat error for $runId: $errorMsg") + callback(null) + } } } @@ -128,7 +464,7 @@ class OpenClawEventClient { put("method", "connect") put("params", JSONObject().apply { put("minProtocol", 3) - put("maxProtocol", 3) + put("maxProtocol", 4) put("client", JSONObject().apply { put("id", "android-node") put("displayName", "VisionClaw Glass") @@ -137,7 +473,6 @@ class OpenClawEventClient { put("mode", "node") }) put("role", "node") - put("scopes", JSONArray()) put("caps", JSONArray().apply { put("camera") put("voice") @@ -147,11 
+482,40 @@ class OpenClawEventClient { put("auth", JSONObject().apply { put("token", GeminiConfig.openClawGatewayToken) }) + put("scopes", JSONArray().apply { + put("operator.admin") + }) }) } webSocket?.send(connectMsg.toString()) } + private fun subscribeSessionEvents() { + val reqId = UUID.randomUUID().toString() + pendingResponses[reqId] = { response -> + val ok = response.optBoolean("ok", false) + if (ok) { + val subscribed = response.optJSONObject("result")?.optBoolean("subscribed", false) ?: false + Log.d(TAG, "sessions.subscribe ok subscribed=$subscribed") + } else { + val error = response.optJSONObject("error") + val msg = error?.optString("message", "unknown") ?: "unknown" + Log.w(TAG, "sessions.subscribe failed: $msg") + } + } + val request = JSONObject().apply { + put("type", "req") + put("id", reqId) + put("method", "sessions.subscribe") + put("params", JSONObject()) + } + val sent = webSocket?.send(request.toString()) ?: false + if (!sent) { + pendingResponses.remove(reqId) + Log.w(TAG, "sessions.subscribe send failed") + } + } + private fun handleHeartbeatEvent(payload: JSONObject) { val status = payload.optString("status", "") if (status != "sent") return @@ -179,6 +543,71 @@ class OpenClawEventClient { onNotification?.invoke("[Scheduled update] $summary") } + /** + * Send a chat message with optional image attachment via WebSocket chat.send RPC. + * This is the only way to reliably pass images to the OpenClaw agent. + * Returns the agent's reply text, or null on failure. + */ + fun sendChatMessage( + sessionKey: String, + message: String, + imageBase64: String? = null, + imageMimeType: String = "image/jpeg", + onResult: (String?) 
-> Unit + ) { + if (!isConnected || webSocket == null) { + Log.e(TAG, "Cannot send chat.send: not connected") + onResult(null) + return + } + + val reqId = UUID.randomUUID().toString() + + val params = JSONObject().apply { + put("sessionKey", sessionKey) + put("message", message) + put("idempotencyKey", reqId) + if (imageBase64 != null) { + put("attachments", JSONArray().put(JSONObject().apply { + put("mimeType", imageMimeType) + put("fileName", "camera_frame.jpg") + put("content", imageBase64) + })) + } + } + + val request = JSONObject().apply { + put("type", "req") + put("id", reqId) + put("method", "chat.send") + put("params", params) + } + + // Register callback for RPC ack — then wait for the actual chat event + pendingResponses[reqId] = { response -> + val ok = response.optBoolean("ok", false) + if (ok) { + // RPC accepted — now wait for the "chat" event with state="final" + Log.d(TAG, "chat.send accepted, waiting for agent reply (runId=$reqId)") + pendingChatResults[reqId] = onResult + } else { + val error = response.optJSONObject("error") + val msg = error?.optString("message", "unknown") ?: "unknown" + Log.e(TAG, "chat.send rejected: $msg") + onResult(null) + } + } + + val sent = webSocket?.send(request.toString()) ?: false + if (!sent) { + pendingResponses.remove(reqId) + Log.e(TAG, "Failed to send chat.send WebSocket message") + onResult(null) + } else { + Log.d(TAG, "chat.send sent (id=$reqId, hasImage=${imageBase64 != null})") + } + } + private fun scheduleReconnect() { if (!shouldReconnect) return Log.d(TAG, "Reconnecting in ${reconnectDelayMs}ms") diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/ToolCallModels.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/ToolCallModels.kt index 696a0c8a..b9e2479b 100644 --- 
a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/ToolCallModels.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/ToolCallModels.kt @@ -72,7 +72,7 @@ sealed class ToolResult { sealed class ToolCallStatus { data object Idle : ToolCallStatus() - data class Executing(val name: String) : ToolCallStatus() + data class Executing(val name: String, val progressText: String? = null) : ToolCallStatus() data class Completed(val name: String) : ToolCallStatus() data class Failed(val name: String, val error: String) : ToolCallStatus() data class Cancelled(val name: String) : ToolCallStatus() @@ -80,7 +80,7 @@ sealed class ToolCallStatus { val displayText: String get() = when (this) { is Idle -> "" - is Executing -> "Running: $name..." + is Executing -> progressText ?: "OpenClaw is working" is Completed -> "Done: $name" is Failed -> "Failed: $name - $error" is Cancelled -> "Cancelled: $name" @@ -103,12 +103,33 @@ sealed class OpenClawConnectionState { object ToolDeclarations { fun allDeclarationsJSON(): JSONArray { - return JSONArray().put(executeJSON()) + return JSONArray().apply { + put(executeJSON()) + put(capturePhotoJSON()) + } + } + + private fun capturePhotoJSON(): JSONObject { + return JSONObject().apply { + put("name", "capture_photo") + put("description", "Capture and save the current camera frame as a photo. 
Use when the user asks to take a photo, capture what they see, save a picture, or snap a photo.") + put("parameters", JSONObject().apply { + put("type", "object") + put("properties", JSONObject().apply { + put("description", JSONObject().apply { + put("type", "string") + put("description", "Brief description of what is in the photo") + }) + }) + put("required", JSONArray()) + }) + } } private fun executeJSON(): JSONObject { return JSONObject().apply { put("name", "execute") + put("behavior", "NON_BLOCKING") put("description", "Your only way to take action. You have no memory, storage, or ability to do anything on your own -- use this tool for everything: sending messages, searching the web, adding to lists, setting reminders, creating notes, research, drafts, scheduling, smart home control, app interactions, or any request that goes beyond answering a question. When in doubt, use this tool.") put("parameters", JSONObject().apply { put("type", "object") @@ -117,10 +138,13 @@ object ToolDeclarations { put("type", "string") put("description", "Clear, detailed description of what to do. Include all relevant context: names, content, platforms, quantities, etc.") }) + put("include_image", JSONObject().apply { + put("type", "boolean") + put("description", "Set to true ONLY when the task requires the agent to see the current camera image (e.g. editing a photo, identifying a product by appearance, reading text from a sign). 
Do NOT set for tasks that can be described in text alone.") + }) }) put("required", JSONArray().put("task")) }) - put("behavior", "BLOCKING") } } } diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/ToolCallRouter.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/ToolCallRouter.kt index 35337e14..f32909b8 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/ToolCallRouter.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/ToolCallRouter.kt @@ -1,24 +1,47 @@ +// app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/openclaw/ToolCallRouter.kt package com.meta.wearable.dat.externalsampleapps.cameraaccess.openclaw +import android.graphics.Bitmap import android.util.Log +import com.meta.wearable.dat.externalsampleapps.cameraaccess.settings.SettingsManager +import java.io.ByteArrayOutputStream import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.Job +import kotlinx.coroutines.cancel +import kotlinx.coroutines.isActive import kotlinx.coroutines.launch import org.json.JSONArray import org.json.JSONObject class ToolCallRouter( private val bridge: OpenClawBridge, - private val scope: CoroutineScope + private val scope: CoroutineScope, + private val latestFrameProvider: () -> Bitmap?, + private val originalInstructionProvider: () -> String? ) { companion object { private const val TAG = "ToolCallRouter" + private const val JPEG_QUALITY_FOR_UPLOAD = 92 private const val MAX_CONSECUTIVE_FAILURES = 3 } + /** Callback for local capture_photo handling. */ + var onCapturePhoto: ((description: String?, completion: (ToolResult) -> Unit) -> Unit)? = null + + /** Callback to auto-save frame to gallery when image is attached to execute call. */ + var onAutoSaveFrame: ((Bitmap, String?) 
-> Unit)? = null + private val inFlightJobs = mutableMapOf() + private val pendingDuplicateExecuteResponses = mutableListOf() + private var activeExecuteCallId: String? = null private var consecutiveFailures = 0 + private data class PendingDuplicateExecute( + val callId: String, + val callName: String, + val sendResponse: (JSONObject) -> Unit + ) + fun handleToolCall( call: GeminiFunctionCall, sendResponse: (JSONObject) -> Unit @@ -28,6 +51,20 @@ class ToolCallRouter( Log.d(TAG, "Received: $callName (id: $callId) args: ${call.args}") + // Local tool: capture_photo; handle on-device and do not send to OpenClaw. + if (callName == "capture_photo") { + val description = call.args["description"]?.toString() + onCapturePhoto?.invoke(description) { result -> + Log.d(TAG, "capture_photo result: $result") + val response = buildToolResponse(callId, callName, result) + sendResponse(response) + } ?: run { + val response = buildToolResponse(callId, callName, ToolResult.Failure("capture_photo handler not configured")) + sendResponse(response) + } + return + } + // Circuit breaker: stop sending tool calls after repeated failures if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) { Log.d(TAG, "Circuit breaker open ($consecutiveFailures consecutive failures), rejecting $callId") @@ -39,24 +76,92 @@ class ToolCallRouter( return } + if (callName == "execute" && activeExecuteCallId != null) { + Log.w(TAG, "Coalescing duplicate execute call $callId into active call $activeExecuteCallId") + pendingDuplicateExecuteResponses.add( + PendingDuplicateExecute( + callId = callId, + callName = callName, + sendResponse = sendResponse + ) + ) + return + } + + if (callName == "execute") { + activeExecuteCallId = callId + } + val job = scope.launch { - val taskDesc = call.args["task"]?.toString() ?: call.args.toString() - val result = bridge.delegateTask(task = taskDesc, toolName = callName) + // Gemini-provided task text after tool-call argument rewriting. 
+ val rewrittenTask = call.args["task"]?.toString() ?: call.args.toString() - if (!coroutineContext[Job]!!.isCancelled) { - Log.d(TAG, "Result for $callName (id: $callId): $result") + // Original transcript captured before Gemini rewrote the tool arguments. + val original = originalInstructionProvider() + ?.trim() + ?.takeIf { it.isNotEmpty() } - when (result) { - is ToolResult.Success -> consecutiveFailures = 0 - is ToolResult.Failure -> consecutiveFailures++ - } + // Attach image only when Gemini explicitly sets include_image=true + val includeImage = call.args["include_image"] as? Boolean ?: false + val bitmap = if (includeImage) latestFrameProvider() else null + Log.d(TAG, "include_image=$includeImage, bitmapNull=${bitmap == null}") - val response = buildToolResponse(callId, callName, result) - sendResponse(response) + val imageBase64: String? = if (includeImage && bitmap != null) { + try { + // Auto-save to gallery + onAutoSaveFrame?.invoke(bitmap, rewrittenTask.take(100)) + val baos = ByteArrayOutputStream() + bitmap.compress(Bitmap.CompressFormat.JPEG, JPEG_QUALITY_FOR_UPLOAD, baos) + android.util.Base64.encodeToString(baos.toByteArray(), android.util.Base64.NO_WRAP) + } catch (e: Exception) { + Log.w(TAG, "Image encoding failed for tool-call $callId: ${e.message}") + null + } } else { - Log.d(TAG, "Task $callId was cancelled, skipping response") + null + } + + // Build task payload with original instruction context + val taskPayload = buildString { + if (original != null) { + append("[original_instruction]\n") + append(original) + append("\n\n") + } + append("[gemini_rewritten_instruction]\n") + append(rewrittenTask) + } + + val result = bridge.delegateTask(task = taskPayload, toolName = callName, imageBase64 = imageBase64) + + // Do not send a tool response for cancelled calls. 
+ if (!isActive) { + Log.d(TAG, "Task $callId cancelled; skipping response") + return@launch } + when (result) { + is ToolResult.Success -> consecutiveFailures = 0 + is ToolResult.Failure -> consecutiveFailures++ + } + + val response = buildToolResponse(callId, callName, result) + sendResponse(response) + + if (callName == "execute") { + val duplicates = pendingDuplicateExecuteResponses.toList() + pendingDuplicateExecuteResponses.clear() + activeExecuteCallId = null + for (duplicate in duplicates) { + duplicate.sendResponse( + buildToolResponse( + callId = duplicate.callId, + name = duplicate.callName, + result = result + ) + ) + } + } inFlightJobs.remove(callId) } @@ -71,7 +176,13 @@ class ToolCallRouter( inFlightJobs.remove(id) } } + bridge.cancelInFlight("tool cancellation ids=$ids") bridge.setToolCallStatus(ToolCallStatus.Cancelled(ids.firstOrNull() ?: "unknown")) + pendingDuplicateExecuteResponses.removeAll { it.callId in ids } + if (activeExecuteCallId in ids) { + activeExecuteCallId = null + pendingDuplicateExecuteResponses.clear() + } } fun cancelAll() { @@ -80,6 +191,9 @@ class ToolCallRouter( job.cancel() } inFlightJobs.clear() + activeExecuteCallId = null + pendingDuplicateExecuteResponses.clear() + bridge.cancelInFlight("cancelAll") consecutiveFailures = 0 } @@ -89,13 +203,23 @@ class ToolCallRouter( result: ToolResult ): JSONObject { return JSONObject().apply { - put("toolResponse", JSONObject().apply { - put("functionResponses", JSONArray().put(JSONObject().apply { - put("id", callId) - put("name", name) - put("response", result.toJSON()) - })) - }) + put( + "toolResponse", + JSONObject().apply { + put( + "functionResponses", + JSONArray().put( + JSONObject().apply { + put("id", callId) + put("name", name) + put("response", result.toJSON().apply { + put("scheduling", "INTERRUPT") + }) + } + ) + ) + } + ) } } } diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/settings/SettingsManager.kt 
b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/settings/SettingsManager.kt index dd8d2d26..7c8261c5 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/settings/SettingsManager.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/settings/SettingsManager.kt @@ -45,40 +45,155 @@ object SettingsManager { set(value) = prefs.edit().putString("webrtcSignalingURL", value).apply() var videoStreamingEnabled: Boolean - get() = prefs.getBoolean("videoStreamingEnabled", true) + get() = prefs.getBoolean("videoStreamingEnabled", false) set(value) = prefs.edit().putBoolean("videoStreamingEnabled", value).apply() var proactiveNotificationsEnabled: Boolean get() = prefs.getBoolean("proactiveNotificationsEnabled", true) set(value) = prefs.edit().putBoolean("proactiveNotificationsEnabled", value).apply() + var demoSpeakerModeEnabled: Boolean + get() = prefs.getBoolean("demoSpeakerModeEnabled", false) + set(value) = prefs.edit().putBoolean("demoSpeakerModeEnabled", value).apply() + fun resetAll() { prefs.edit().clear().apply() } - const val DEFAULT_SYSTEM_PROMPT = """You are an AI assistant for someone wearing Meta Ray-Ban smart glasses. You can see through their camera and have a voice conversation. Keep responses concise and natural. - -CRITICAL: You have NO memory, NO storage, and NO ability to take actions on your own. You cannot remember things, keep lists, set reminders, search the web, send messages, or do anything persistent. You are ONLY a voice interface. - -You have exactly ONE tool: execute. This connects you to a powerful personal assistant that can do anything -- send messages, search the web, manage lists, set reminders, create notes, research topics, control smart home devices, interact with apps, and much more. 
- -ALWAYS use execute when the user asks you to: -- Send a message to someone (any platform: WhatsApp, Telegram, iMessage, Slack, etc.) -- Search or look up anything (web, local info, facts, news) -- Add, create, or modify anything (shopping lists, reminders, notes, todos, events) -- Research, analyze, or draft anything -- Control or interact with apps, devices, or services -- Remember or store any information for later + const val DEFAULT_SYSTEM_PROMPT = """You are an AI assistant for someone wearing Meta Ray-Ban smart glasses. +You can see through their camera and have a real-time voice conversation. +Keep responses concise, natural, and conversational. -Be detailed in your task description. Include all relevant context: names, content, platforms, quantities, etc. The assistant works better with complete information. +You do NOT have persistent memory or storage. +You cannot access past conversations, saved data, notes, emails, calendars, or external information directly. -NEVER pretend to do these things yourself. +You are ONLY a voice interface. -IMPORTANT: Before calling execute, ALWAYS speak a brief acknowledgment first. For example: -- "Sure, let me add that to your shopping list." then call execute. -- "Got it, searching for that now." then call execute. -- "On it, sending that message." then call execute. -Never call execute silently -- the user needs verbal confirmation that you heard them and are working on it. The tool may take several seconds to complete, so the acknowledgment lets them know something is happening. +You have two tools: execute and capture_photo. -For messages, confirm recipient and content before delegating unless clearly urgent.""" +The capture_photo tool saves the current camera frame as a photo to the device gallery. +Use it when the user asks to take a photo, capture what they see, save a picture, or snap a photo. +You can include an optional description of what is in the photo. 
+ +When calling execute, you MUST set include_image=true whenever: +- The user asks to send, share, or forward a photo/image to anyone +- The task involves editing, processing, or analyzing an image +- The user says "send this to..." or "show this to..." referring to what they see +- The task requires the assistant to see the current camera view (e.g. identifying a product, reading text from a sign) +Only omit include_image (or set it to false) for purely text-based tasks like sending a text message, searching, or setting a reminder. + +The execute tool connects you to a powerful personal assistant that can: +- Send messages (WhatsApp, Telegram, iMessage, Slack, etc.) +- Search the web or look up information +- Access memory, past conversations, emails, notes, and calendar events +- Create, modify, or delete reminders, lists, todos, events +- Research, analyze, summarize, or draft content +- Control apps, services, and smart home devices +- Store or retrieve persistent information + +You CANNOT do any of these things yourself. +You MUST use execute for all of them. + +-------------------------------- +CRITICAL TOOL USAGE RULES +-------------------------------- + +You MUST call execute whenever the user: + +1. Asks to send a message on any platform. +2. Asks to search or look up anything (facts, news, locations, prices, etc.). +3. Refers to ANY past information. +4. Asks about previous conversations or earlier decisions. +5. Mentions something they did before. +6. Asks to check email, calendar, reminders, notes, or tasks. +7. Asks to remember something for later. +8. Asks to create, update, delete, or manage anything. +9. Asks to analyze, research, or draft content. +10. Asks to interact with apps, services, or devices. + +If the user refers to ANY time in the past (e.g., "last week", "earlier", "before", "did I", "what did we say", "check if I", etc.), you MUST use execute. +Never answer these from conversation context. + +Never attempt to simulate memory. 
+ +-------------------------------- +IMPORTANT: VERBAL ACKNOWLEDGMENT +-------------------------------- + +Before calling execute, ALWAYS say a brief acknowledgment out loud. + +Examples: +- "Sure, let me check that." +- "Got it, searching now." +- "On it, sending that message." +- "Okay, I’ll look that up." +- "Let me check your previous notes." + +Never call execute silently. + +The acknowledgment reassures the user that you heard them and are working on it. + +-------------------------------- +TASK DESCRIPTION QUALITY +-------------------------------- + +When calling execute: + +- Be detailed and precise. +- Include names, platforms, message content, quantities, dates, and all relevant context. +- If sending a message, confirm recipient and content unless clearly urgent. +- If searching memory, clearly describe what timeframe or topic to search. + +The assistant works best with complete instructions. + +When execute returns: + +- Base your answer on the execute result. +- Do not add extra facts that are not supported by the execute result. +- If the execute result is already concise, relay it naturally and briefly. +- Do not call execute again for the same user request unless the result explicitly says more checking is needed. + +-------------------------------- +RESPONSE STYLE +-------------------------------- + +When not using execute: + +- Keep responses short. +- Be natural and conversational. +- Do not over-explain. +- Do not mention internal reasoning. + +Never pretend to take actions yourself. +Only execute can perform real-world tasks.""" +// const val DEFAULT_SYSTEM_PROMPT = """You are an AI assistant for someone wearing Meta Ray-Ban smart glasses. You can see through their camera and have a voice conversation. Keep responses concise and natural. +//CRITICAL: Any question about past conversations, previous actions, earlier messages, saved notes, emails, calendar events, or anything the user did before MUST trigger execute. 
+//You cannot answer these from context. +// +//CRITICAL: You do not have persistent memory or storage. +//You cannot access past conversations or stored data directly. +// +//To retrieve any past information, you MUST use the execute tool. +// +//You have exactly ONE tool: execute. This connects you to a powerful personal assistant that can do anything -- send messages, search the web, manage lists, set reminders, create notes, research topics, control smart home devices, interact with apps, and much more. +// +//ALWAYS use execute when the user asks you to: +//- Send a message to someone (any platform: WhatsApp, Telegram, iMessage, Slack, etc.) +//- Search or look up anything (web, local info, facts, news) +//- Add, create, or modify anything (shopping lists, reminders, notes, todos, events) +//- Research, analyze, or draft anything +//- Control or interact with apps, devices, or services +//- Remember or store any information for later +// +//Be detailed in your task description. Include all relevant context: names, content, platforms, quantities, etc. The assistant works better with complete information. +// +//NEVER pretend to do these things yourself. +// +//IMPORTANT: Before calling execute, ALWAYS speak a brief acknowledgment first. For example: +//- "Sure, let me add that to your shopping list." then call execute. +//- "Got it, searching for that now." then call execute. +//- "On it, sending that message." then call execute. +//Never call execute silently -- the user needs verbal confirmation that you heard them and are working on it. The tool may take several seconds to complete, so the acknowledgment lets them know something is happening. 
+// +//For messages, confirm recipient and content before delegating unless clearly urgent.""" } diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/stream/StreamViewModel.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/stream/StreamViewModel.kt index ae86a2c6..3a860707 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/stream/StreamViewModel.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/stream/StreamViewModel.kt @@ -35,8 +35,10 @@ import com.meta.wearable.dat.core.Wearables import com.meta.wearable.dat.core.selectors.DeviceSelector import com.meta.wearable.dat.externalsampleapps.cameraaccess.gemini.GeminiSessionViewModel import com.meta.wearable.dat.externalsampleapps.cameraaccess.phone.PhoneCameraManager +import com.meta.wearable.dat.externalsampleapps.cameraaccess.settings.SettingsManager import com.meta.wearable.dat.externalsampleapps.cameraaccess.wearables.WearablesViewModel import com.meta.wearable.dat.externalsampleapps.cameraaccess.webrtc.WebRTCSessionViewModel +import com.meta.wearable.dat.externalsampleapps.cameraaccess.stream.StreamingService import java.io.ByteArrayInputStream import java.io.ByteArrayOutputStream import java.io.File @@ -73,38 +75,138 @@ class StreamViewModel( var webrtcViewModel: WebRTCSessionViewModel? = null private var phoneCameraManager: PhoneCameraManager? = null + fun setStreamingMode(mode: StreamingMode) { + _uiState.update { it.copy(streamingMode = mode) } + } + + fun setVideoStreamingEnabled(enabled: Boolean, lifecycleOwner: LifecycleOwner? 
= null) { + SettingsManager.videoStreamingEnabled = enabled + + if (enabled) { + when (_uiState.value.streamingMode) { + StreamingMode.PHONE -> lifecycleOwner?.let { startPhoneCamera(it) } + StreamingMode.GLASSES -> startStream() + } + } else { + stopActiveVideoSource(preserveMode = true) + clearVideoCache() + } + } + + fun clearVideoCache() { + geminiViewModel?.clearCachedVideoFrame() + _uiState.update { it.copy(videoFrame = null) } + } + + private fun stopActiveVideoSource(preserveMode: Boolean) { + Log.d(TAG, "Stopping active video source preserveMode=$preserveMode") + + StreamingService.stop(getApplication()) + + videoJob?.cancel() + videoJob = null + stateJob?.cancel() + stateJob = null + streamSession?.close() + streamSession = null + phoneCameraManager?.stop() + phoneCameraManager = null + + val mode = if (preserveMode) _uiState.value.streamingMode else StreamingMode.GLASSES + _uiState.update { + it.copy( + streamSessionState = StreamSessionState.STOPPED, + videoFrame = null, + capturedPhoto = null, + isShareDialogVisible = false, + isCapturing = false, + streamingMode = mode, + ) + } + } + fun startStream() { + if (!SettingsManager.videoStreamingEnabled) { + setStreamingMode(StreamingMode.GLASSES) + stopActiveVideoSource(preserveMode = true) + clearVideoCache() + return + } + + if (streamSession != null) { + Log.d(TAG, "Ignoring startStream because a stream session already exists") + return + } + videoJob?.cancel() stateJob?.cancel() - // Start foreground service to keep streaming alive in background / screen locked StreamingService.start(getApplication()) val streamSession = + try { Wearables.startStreamSession( - getApplication(), - deviceSelector, - StreamConfiguration(videoQuality = VideoQuality.MEDIUM, 24), - ) - .also { streamSession = it } - _uiState.update { it.copy(streamingMode = StreamingMode.GLASSES) } - videoJob = viewModelScope.launch { streamSession.videoStream.collect { handleVideoFrame(it) } } + getApplication(), + deviceSelector, + 
StreamConfiguration(videoQuality = VideoQuality.MEDIUM, 24), + ).also { streamSession = it } + } catch (t: Throwable) { + Log.e(TAG, "Failed to start stream session", t) + StreamingService.stop(getApplication()) + _uiState.update { it.copy(streamSessionState = StreamSessionState.STOPPED) } + return + } + + + _uiState.update { + it.copy( + streamingMode = StreamingMode.GLASSES, + streamSessionState = StreamSessionState.STARTING, + ) + } + + videoJob = + viewModelScope.launch { + streamSession.videoStream.collect { frame -> handleVideoFrame(frame) } + } + stateJob = - viewModelScope.launch { - streamSession.state.collect { currentState -> - val prevState = _uiState.value.streamSessionState - _uiState.update { it.copy(streamSessionState = currentState) } - - // navigate back when state transitioned to STOPPED - if (currentState != prevState && currentState == StreamSessionState.STOPPED) { - stopStream() - wearablesViewModel.navigateToDeviceSelection() - } + viewModelScope.launch { + Log.d(TAG, "Stream state collector launched") + var sawStartedState = false + streamSession.state.collect { currentState -> + Log.d(TAG, "Stream state = $currentState") + val prevState = _uiState.value.streamSessionState + _uiState.update { it.copy(streamSessionState = currentState) } + + if ( + currentState == StreamSessionState.STARTING || + currentState == StreamSessionState.STREAMING + ) { + sawStartedState = true + } + + if ( + sawStartedState && + currentState != prevState && + currentState == StreamSessionState.STOPPED + ) { + Log.d(TAG, "Stream state became STOPPED; stopping stream") + stopStream() + wearablesViewModel.navigateToDeviceSelection() } } + } } fun startPhoneCamera(lifecycleOwner: LifecycleOwner) { + if (!SettingsManager.videoStreamingEnabled) { + setStreamingMode(StreamingMode.PHONE) + stopActiveVideoSource(preserveMode = true) + clearVideoCache() + return + } + val manager = PhoneCameraManager(getApplication()) phoneCameraManager = manager @@ -127,18 +229,8 @@ class 
StreamViewModel( } fun stopStream() { - // Stop foreground service - StreamingService.stop(getApplication()) - - videoJob?.cancel() - videoJob = null - stateJob?.cancel() - stateJob = null - streamSession?.close() - streamSession = null - phoneCameraManager?.stop() - phoneCameraManager = null - _uiState.update { INITIAL_STATE } + Log.d(TAG, "Stopping stream") + stopActiveVideoSource(preserveMode = false) } fun capturePhoto() { diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/stream/StreamingService.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/stream/StreamingService.kt index 5a143f3b..9358e361 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/stream/StreamingService.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/stream/StreamingService.kt @@ -10,6 +10,7 @@ import android.content.Intent import android.content.pm.ServiceInfo import android.os.Build import android.os.IBinder +import android.net.wifi.WifiManager import android.os.PowerManager import android.util.Log import androidx.core.app.NotificationCompat @@ -26,124 +27,172 @@ import com.meta.wearable.dat.externalsampleapps.cameraaccess.R */ class StreamingService : Service() { - companion object { - private const val TAG = "StreamingService" - private const val CHANNEL_ID = "streaming_channel" - private const val CHANNEL_NAME = "Camera Streaming" - private const val NOTIFICATION_ID = 1001 - private const val WAKELOCK_TAG = "VisionClaw::StreamingWakeLock" - - fun start(context: Context) { - val intent = - Intent(context, StreamingService::class.java).apply { `package` = context.packageName } - if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { - context.startForegroundService(intent) - } else { - context.startService(intent) - } + companion object { + private const 
val TAG = "StreamingService" + private const val CHANNEL_ID = "streaming_channel" + private const val CHANNEL_NAME = "Camera Streaming" + private const val NOTIFICATION_ID = 1001 + private const val WAKELOCK_TAG = "VisionClaw::StreamingWakeLock" + private const val ACTION_START = "com.meta.wearable.dat.externalsampleapps.cameraaccess.stream.START" + private const val ACTION_STOP = "com.meta.wearable.dat.externalsampleapps.cameraaccess.stream.STOP" + + fun start(context: Context) { + val intent = + Intent(context, StreamingService::class.java).apply { + `package` = context.packageName + action = ACTION_START + } + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { + context.startForegroundService(intent) + } else { + context.startService(intent) + } + } + + fun stop(context: Context) { + val intent = + Intent(context, StreamingService::class.java).apply { + `package` = context.packageName + action = ACTION_STOP + } + try { + context.startService(intent) + } catch (e: IllegalStateException) { + Log.w(TAG, "Unable to send stop command; stopping service directly", e) + context.stopService(intent) + } + } } - fun stop(context: Context) { - val intent = - Intent(context, StreamingService::class.java).apply { `package` = context.packageName } - context.stopService(intent) + private var wakeLock: PowerManager.WakeLock? = null + private var wifiLock: WifiManager.WifiLock? = null + + override fun onBind(intent: Intent?): IBinder? = null + + override fun onCreate() { + super.onCreate() + Log.d(TAG, "Service created") + createNotificationChannel() } - } - private var wakeLock: PowerManager.WakeLock? 
= null + override fun onStartCommand(intent: Intent?, flags: Int, startId: Int): Int { + Log.d(TAG, "Service command: ${intent?.action ?: ACTION_START}") + + if (intent?.action == ACTION_STOP) { + stopSelf(startId) + return START_NOT_STICKY + } + + startInForeground() + acquireWakeLock() + acquireWifiLock() + + return START_NOT_STICKY + } - override fun onBind(intent: Intent?): IBinder? = null + override fun onDestroy() { + Log.d(TAG, "Service destroyed") + releaseWakeLock() + releaseWifiLock() + super.onDestroy() + } - override fun onCreate() { - super.onCreate() - Log.d(TAG, "Service created") - createNotificationChannel() - } + private fun createNotificationChannel() { + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { + val channel = + NotificationChannel( + CHANNEL_ID, + CHANNEL_NAME, + NotificationManager.IMPORTANCE_LOW, + ) + .apply { + description = "Notifications for active camera streaming" + setShowBadge(false) + } + + val notificationManager = getSystemService(NotificationManager::class.java) + notificationManager.createNotificationChannel(channel) + } + } - override fun onStartCommand(intent: Intent?, flags: Int, startId: Int): Int { - Log.d(TAG, "Service started") + private fun startInForeground() { + val notification = createNotification() + + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) { + startForeground( + NOTIFICATION_ID, + notification, + ServiceInfo.FOREGROUND_SERVICE_TYPE_CONNECTED_DEVICE or + ServiceInfo.FOREGROUND_SERVICE_TYPE_MICROPHONE, + ) + } else { + startForeground(NOTIFICATION_ID, notification) + } + } - val notification = createNotification() + private fun createNotification(): Notification { + val pendingIntent = + PendingIntent.getActivity( + this, + 0, + Intent(this, MainActivity::class.java).apply { + flags = Intent.FLAG_ACTIVITY_SINGLE_TOP or Intent.FLAG_ACTIVITY_CLEAR_TOP + }, + PendingIntent.FLAG_IMMUTABLE or PendingIntent.FLAG_UPDATE_CURRENT, + ) + + return NotificationCompat.Builder(this, CHANNEL_ID) + 
.setContentTitle("Camera Streaming") + .setContentText("Streaming from your glasses...") + .setSmallIcon(R.drawable.ic_launcher_foreground) + .setOngoing(true) + .setContentIntent(pendingIntent) + .setPriority(NotificationCompat.PRIORITY_LOW) + .setCategory(NotificationCompat.CATEGORY_SERVICE) + .build() + } - if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) { - startForeground( - NOTIFICATION_ID, - notification, - ServiceInfo.FOREGROUND_SERVICE_TYPE_CONNECTED_DEVICE, - ) - } else { - startForeground(NOTIFICATION_ID, notification) + private fun acquireWakeLock() { + if (wakeLock == null) { + val powerManager = getSystemService(Context.POWER_SERVICE) as PowerManager + wakeLock = + powerManager.newWakeLock(PowerManager.PARTIAL_WAKE_LOCK, WAKELOCK_TAG).apply { + acquire() // No timeout — held for entire streaming session + } + Log.d(TAG, "WakeLock acquired (indefinite)") + } } - acquireWakeLock() - - return START_STICKY - } - - override fun onDestroy() { - Log.d(TAG, "Service destroyed") - releaseWakeLock() - super.onDestroy() - } - - private fun createNotificationChannel() { - if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { - val channel = - NotificationChannel( - CHANNEL_ID, - CHANNEL_NAME, - NotificationManager.IMPORTANCE_LOW, - ) - .apply { - description = "Notifications for active camera streaming" - setShowBadge(false) - } - - val notificationManager = getSystemService(NotificationManager::class.java) - notificationManager.createNotificationChannel(channel) + private fun releaseWakeLock() { + wakeLock?.let { + if (it.isHeld) { + it.release() + Log.d(TAG, "WakeLock released") + } + } + wakeLock = null } - } - - private fun createNotification(): Notification { - val pendingIntent = - PendingIntent.getActivity( - this, - 0, - Intent(this, MainActivity::class.java).apply { - flags = Intent.FLAG_ACTIVITY_SINGLE_TOP or Intent.FLAG_ACTIVITY_CLEAR_TOP - }, - PendingIntent.FLAG_IMMUTABLE or PendingIntent.FLAG_UPDATE_CURRENT, - ) - - return 
NotificationCompat.Builder(this, CHANNEL_ID) - .setContentTitle("Camera Streaming") - .setContentText("Streaming from your glasses...") - .setSmallIcon(R.drawable.ic_launcher_foreground) - .setOngoing(true) - .setContentIntent(pendingIntent) - .setPriority(NotificationCompat.PRIORITY_LOW) - .setCategory(NotificationCompat.CATEGORY_SERVICE) - .build() - } - - private fun acquireWakeLock() { - if (wakeLock == null) { - val powerManager = getSystemService(Context.POWER_SERVICE) as PowerManager - wakeLock = - powerManager.newWakeLock(PowerManager.PARTIAL_WAKE_LOCK, WAKELOCK_TAG).apply { - acquire(10 * 60 * 1000L) // 10 minutes max - } - Log.d(TAG, "WakeLock acquired") + + private fun acquireWifiLock() { + if (wifiLock == null) { + val wifiManager = applicationContext.getSystemService(Context.WIFI_SERVICE) as WifiManager + wifiLock = wifiManager.createWifiLock( + WifiManager.WIFI_MODE_FULL_HIGH_PERF, + "VisionClaw::GeminiWiFiLock" + ).apply { + acquire() + } + Log.d(TAG, "WiFiLock acquired") + } } - } - - private fun releaseWakeLock() { - wakeLock?.let { - if (it.isHeld) { - it.release() - Log.d(TAG, "WakeLock released") - } + + private fun releaseWifiLock() { + wifiLock?.let { + if (it.isHeld) { + it.release() + Log.d(TAG, "WiFiLock released") + } + } + wifiLock = null } - wakeLock = null - } } diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/CameraAccessScaffold.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/CameraAccessScaffold.kt index e8e611f9..efed6064 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/CameraAccessScaffold.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/CameraAccessScaffold.kt @@ -54,19 +54,19 @@ import androidx.compose.ui.Alignment import androidx.compose.ui.Modifier import 
androidx.compose.ui.unit.dp import androidx.lifecycle.compose.collectAsStateWithLifecycle -import com.meta.wearable.dat.core.types.Permission -import com.meta.wearable.dat.core.types.PermissionStatus +import androidx.lifecycle.viewmodel.compose.viewModel as composeViewModel import com.meta.wearable.dat.externalsampleapps.cameraaccess.BuildConfig +import com.meta.wearable.dat.externalsampleapps.cameraaccess.gemini.GeminiSessionViewModel import com.meta.wearable.dat.externalsampleapps.cameraaccess.wearables.WearablesViewModel @OptIn(ExperimentalMaterial3Api::class) @Composable fun CameraAccessScaffold( viewModel: WearablesViewModel, - onRequestWearablesPermission: suspend (Permission) -> PermissionStatus, modifier: Modifier = Modifier, ) { val uiState by viewModel.uiState.collectAsStateWithLifecycle() + val geminiViewModel: GeminiSessionViewModel = composeViewModel() val snackbarHostState = remember { SnackbarHostState() } val bottomSheetState = rememberModalBottomSheetState(skipPartiallyExpanded = true) @@ -84,16 +84,23 @@ fun CameraAccessScaffold( uiState.isSettingsVisible -> SettingsScreen( onBack = { viewModel.hideSettings() }, + onDebugMenu = if (BuildConfig.DEBUG) {{ viewModel.showDebugMenu() }} else null, + onOpenClawNewSession = if (BuildConfig.DEBUG) { + { geminiViewModel.runOpenClawDeveloperCommand("/new") } + } else null, + onOpenClawCompactSession = if (BuildConfig.DEBUG) { + { geminiViewModel.runOpenClawDeveloperCommand("/compact") } + } else null, ) uiState.isStreaming -> StreamScreen( wearablesViewModel = viewModel, isPhoneMode = uiState.isPhoneMode, + geminiViewModel = geminiViewModel, ) uiState.isRegistered -> NonStreamScreen( viewModel = viewModel, - onRequestWearablesPermission = onRequestWearablesPermission, ) else -> HomeScreen( @@ -126,22 +133,13 @@ fun CameraAccessScaffold( }, ) - if (BuildConfig.DEBUG) { - FloatingActionButton( - onClick = { viewModel.showDebugMenu() }, - modifier = Modifier.align(Alignment.CenterEnd), + if (BuildConfig.DEBUG 
&& uiState.isDebugMenuVisible) { + ModalBottomSheet( + onDismissRequest = { viewModel.hideDebugMenu() }, + sheetState = bottomSheetState, + modifier = Modifier.fillMaxSize(), ) { - Icon(Icons.Default.BugReport, contentDescription = "Debug Menu") - } - - if (uiState.isDebugMenuVisible) { - ModalBottomSheet( - onDismissRequest = { viewModel.hideDebugMenu() }, - sheetState = bottomSheetState, - modifier = Modifier.fillMaxSize(), - ) { - MockDeviceKitScreen(modifier = Modifier.fillMaxSize()) - } + MockDeviceKitScreen(modifier = Modifier.fillMaxSize()) } } } diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/ChatTranscriptView.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/ChatTranscriptView.kt new file mode 100644 index 00000000..10eb43aa --- /dev/null +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/ChatTranscriptView.kt @@ -0,0 +1,214 @@ +package com.meta.wearable.dat.externalsampleapps.cameraaccess.ui + +import androidx.compose.foundation.background +import androidx.compose.foundation.layout.Arrangement +import androidx.compose.foundation.layout.Box +import androidx.compose.foundation.layout.Column +import androidx.compose.foundation.layout.Row +import androidx.compose.foundation.layout.Spacer +import androidx.compose.foundation.layout.fillMaxSize +import androidx.compose.foundation.layout.fillMaxWidth +import androidx.compose.foundation.layout.height +import androidx.compose.foundation.layout.padding +import androidx.compose.foundation.layout.size +import androidx.compose.foundation.layout.width +import androidx.compose.foundation.lazy.LazyColumn +import androidx.compose.foundation.lazy.itemsIndexed +import androidx.compose.foundation.lazy.rememberLazyListState +import androidx.compose.foundation.shape.RoundedCornerShape +import 
androidx.compose.foundation.text.selection.SelectionContainer +import androidx.compose.material3.CircularProgressIndicator +import androidx.compose.material3.Divider +import androidx.compose.material3.HorizontalDivider +import androidx.compose.material3.Text +import androidx.compose.runtime.Composable +import androidx.compose.runtime.LaunchedEffect +import androidx.compose.ui.Alignment +import androidx.compose.ui.Modifier +import androidx.compose.ui.graphics.Color +import androidx.compose.ui.text.font.FontFamily +import androidx.compose.ui.unit.dp +import androidx.compose.ui.unit.sp +import com.meta.wearable.dat.externalsampleapps.cameraaccess.chat.ChatMessage +import com.meta.wearable.dat.externalsampleapps.cameraaccess.chat.ChatMessageRole +import com.meta.wearable.dat.externalsampleapps.cameraaccess.chat.ChatMessageStatus +import java.text.SimpleDateFormat +import java.util.Calendar +import java.util.Date +import java.util.Locale + +@Composable +fun ChatTranscriptView( + messages: List<ChatMessage>, + modifier: Modifier = Modifier, +) { + val listState = rememberLazyListState() + + LaunchedEffect(messages.size, messages.lastOrNull()?.text) { + if (messages.isNotEmpty()) { + listState.animateScrollToItem(messages.size - 1) + } + } + + if (messages.isEmpty()) { + Box(modifier = modifier.fillMaxSize(), contentAlignment = Alignment.Center) { + Text( + text = "Start talking to see the conversation here", + color = Color.Black.copy(alpha = 0.4f), + fontSize = 14.sp, + ) + } + } else { + SelectionContainer { + LazyColumn( + state = listState, + modifier = modifier.fillMaxSize().padding(horizontal = 16.dp), + verticalArrangement = Arrangement.spacedBy(4.dp), + ) { + itemsIndexed(messages, key = { _, msg -> msg.id }) { index, message -> + val showTime = shouldShowTimestamp(index, messages) + MessageBubble(message = message, showTimestamp = showTime) + } + } + } + } +} + +private fun shouldShowTimestamp(index: Int, messages: List<ChatMessage>): Boolean { + val message = messages[index] + if
(message.role is ChatMessageRole.SessionDivider) return false + if (index == 0) return true + val prev = messages[index - 1] + if (prev.role is ChatMessageRole.SessionDivider) return true + return message.timestamp - prev.timestamp > 120_000 // 2+ minutes +} + +private fun formatTime(timestamp: Long): String { + return SimpleDateFormat("h:mm a", Locale.getDefault()).format(Date(timestamp)) +} + +private fun formatSessionDate(timestamp: Long): String { + val cal = Calendar.getInstance() + val today = Calendar.getInstance() + cal.timeInMillis = timestamp + return if (cal.get(Calendar.YEAR) == today.get(Calendar.YEAR) + && cal.get(Calendar.DAY_OF_YEAR) == today.get(Calendar.DAY_OF_YEAR)) { + "Today ${formatTime(timestamp)}" + } else { + SimpleDateFormat("MMM d, h:mm a", Locale.getDefault()).format(Date(timestamp)) + } +} + +@Composable +fun MessageBubble(message: ChatMessage, showTimestamp: Boolean = false, modifier: Modifier = Modifier) { + when (message.role) { + is ChatMessageRole.User -> UserBubble(message, showTimestamp, modifier) + is ChatMessageRole.Assistant -> AssistantBubble(message, showTimestamp, modifier) + is ChatMessageRole.ToolCall -> ToolCallBubble(message.role.name, message, modifier) + is ChatMessageRole.SessionDivider -> SessionDividerView(message, modifier) + } +} + +@Composable +private fun SessionDividerView(message: ChatMessage, modifier: Modifier = Modifier) { + Row( + modifier = modifier + .fillMaxWidth() + .padding(vertical = 12.dp), + verticalAlignment = Alignment.CenterVertically, + ) { + HorizontalDivider(modifier = Modifier.weight(1f), color = Color.Black.copy(alpha = 0.15f)) + Text( + text = formatSessionDate(message.timestamp), + color = Color.Black.copy(alpha = 0.35f), + fontSize = 11.sp, + modifier = Modifier.padding(horizontal = 12.dp), + ) + HorizontalDivider(modifier = Modifier.weight(1f), color = Color.Black.copy(alpha = 0.15f)) + } +} + +@Composable +private fun UserBubble(message: ChatMessage, showTimestamp: Boolean, modifier: 
Modifier = Modifier) { + Column( + modifier = modifier.fillMaxWidth().padding(vertical = 2.dp), + horizontalAlignment = Alignment.End, + ) { + Text( + text = message.text, + color = Color.White, + fontSize = 15.sp, + modifier = Modifier + .background(Color(0xFF2979FF), RoundedCornerShape(18.dp)) + .padding(horizontal = 14.dp, vertical = 10.dp), + ) + if (showTimestamp) { + Text( + text = formatTime(message.timestamp), + color = Color.Black.copy(alpha = 0.3f), + fontSize = 10.sp, + modifier = Modifier.padding(top = 2.dp, end = 4.dp), + ) + } + } +} + +@Composable +private fun AssistantBubble(message: ChatMessage, showTimestamp: Boolean, modifier: Modifier = Modifier) { + Column( + modifier = modifier.fillMaxWidth().padding(vertical = 2.dp), + horizontalAlignment = Alignment.Start, + ) { + Text( + text = message.text, + color = Color.Black.copy(alpha = 0.85f), + fontSize = 15.sp, + ) + if (showTimestamp) { + Text( + text = formatTime(message.timestamp), + color = Color.Black.copy(alpha = 0.3f), + fontSize = 10.sp, + modifier = Modifier.padding(top = 2.dp, start = 4.dp), + ) + } + } +} + +@Composable +private fun ToolCallBubble(name: String, message: ChatMessage, modifier: Modifier = Modifier) { + Row( + modifier = modifier + .fillMaxWidth() + .padding(vertical = 4.dp), + horizontalArrangement = Arrangement.Center, + ) { + Row( + modifier = Modifier + .background(Color.White.copy(alpha = 0.15f), RoundedCornerShape(12.dp)) + .padding(horizontal = 12.dp, vertical = 6.dp), + verticalAlignment = Alignment.CenterVertically, + horizontalArrangement = Arrangement.spacedBy(6.dp), + ) { + if (message.status is ChatMessageStatus.Streaming) { + CircularProgressIndicator( + modifier = Modifier.size(12.dp), + color = Color.Black.copy(alpha = 0.6f), + strokeWidth = 1.5.dp, + ) + } else { + Text( + text = "[OK]", + color = Color(0xFF4CAF50), + fontSize = 11.sp, + fontFamily = FontFamily.Monospace, + ) + } + Text( + text = name, + color = Color.Black.copy(alpha = 0.6f), + fontSize = 
12.sp, + ) + } + } +} diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/ControlsRow.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/ControlsRow.kt index f8c0689f..1ff02090 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/ControlsRow.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/ControlsRow.kt @@ -10,7 +10,8 @@ import androidx.compose.foundation.layout.navigationBarsPadding import androidx.compose.foundation.shape.CircleShape import androidx.compose.material.icons.Icons import androidx.compose.material.icons.filled.AutoAwesome -import androidx.compose.material.icons.filled.Videocam +import androidx.compose.material.icons.filled.Mic +import androidx.compose.material.icons.filled.MicOff import androidx.compose.material3.Button import androidx.compose.material3.ButtonDefaults import androidx.compose.material3.Icon @@ -23,11 +24,10 @@ import androidx.compose.ui.unit.dp @Composable fun ControlsRow( onStopStream: () -> Unit, - onCapturePhoto: () -> Unit, onToggleAI: () -> Unit, isAIActive: Boolean, - onToggleLive: () -> Unit, - isLiveActive: Boolean, + onToggleMic: () -> Unit, + isMicEnabled: Boolean, modifier: Modifier = Modifier, ) { Row( @@ -45,10 +45,6 @@ fun ControlsRow( modifier = Modifier.weight(1f), ) - CaptureButton( - onClick = onCapturePhoto, - ) - // AI toggle button Button( onClick = onToggleAI, @@ -66,21 +62,26 @@ fun ControlsRow( ) } - // Live toggle button + // Mic toggle button (only meaningful when AI is active) Button( - onClick = onToggleLive, + onClick = onToggleMic, + enabled = isAIActive, modifier = Modifier.aspectRatio(1f), colors = ButtonDefaults.buttonColors( - containerColor = if (isLiveActive) AppColor.Red else AppColor.DeepBlue, + containerColor = if (!isAIActive) AppColor.DeepBlue + else 
if (isMicEnabled) AppColor.DeepBlue + else AppColor.Red, + disabledContainerColor = AppColor.DeepBlue, ), shape = CircleShape, contentPadding = PaddingValues(0.dp), ) { Icon( - imageVector = Icons.Default.Videocam, - contentDescription = if (isLiveActive) "Stop Live" else "Start Live", + imageVector = if (isMicEnabled) Icons.Default.Mic else Icons.Default.MicOff, + contentDescription = if (isMicEnabled) "Mute Mic" else "Unmute Mic", tint = Color.White, ) } + } -} +} \ No newline at end of file diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/GalleryDetailScreen.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/GalleryDetailScreen.kt new file mode 100644 index 00000000..3a24e85a --- /dev/null +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/GalleryDetailScreen.kt @@ -0,0 +1,127 @@ +package com.meta.wearable.dat.externalsampleapps.cameraaccess.ui + +import android.content.Context +import android.content.Intent +import android.graphics.Bitmap +import androidx.compose.foundation.Image +import androidx.compose.foundation.layout.Column +import androidx.compose.foundation.layout.Spacer +import androidx.compose.foundation.layout.fillMaxSize +import androidx.compose.foundation.layout.fillMaxWidth +import androidx.compose.foundation.layout.height +import androidx.compose.foundation.layout.padding +import androidx.compose.material.icons.Icons +import androidx.compose.material.icons.automirrored.filled.ArrowBack +import androidx.compose.material.icons.filled.Delete +import androidx.compose.material.icons.filled.Share +import androidx.compose.material3.AlertDialog +import androidx.compose.material3.ExperimentalMaterial3Api +import androidx.compose.material3.Icon +import androidx.compose.material3.IconButton +import androidx.compose.material3.MaterialTheme +import 
androidx.compose.material3.Scaffold +import androidx.compose.material3.Text +import androidx.compose.material3.TextButton +import androidx.compose.material3.TopAppBar +import androidx.compose.runtime.Composable +import androidx.compose.runtime.LaunchedEffect +import androidx.compose.runtime.getValue +import androidx.compose.runtime.mutableStateOf +import androidx.compose.runtime.remember +import androidx.compose.runtime.setValue +import androidx.compose.ui.Modifier +import androidx.compose.ui.graphics.asImageBitmap +import androidx.compose.ui.layout.ContentScale +import androidx.compose.ui.platform.LocalContext +import androidx.compose.ui.unit.dp +import androidx.core.content.FileProvider +import com.meta.wearable.dat.externalsampleapps.cameraaccess.gallery.CapturedPhoto +import com.meta.wearable.dat.externalsampleapps.cameraaccess.gallery.PhotoCaptureStore +import java.text.SimpleDateFormat +import java.util.Date +import java.util.Locale + +@OptIn(ExperimentalMaterial3Api::class) +@Composable +fun GalleryDetailScreen(photo: CapturedPhoto, onBack: () -> Unit) { + val context = LocalContext.current + var bitmap by remember { mutableStateOf<Bitmap?>(null) } + var showDeleteDialog by remember { mutableStateOf(false) } + + LaunchedEffect(photo.id) { + bitmap = PhotoCaptureStore.loadBitmap(context, photo) + } + + val formattedDate = remember(photo.timestamp) { + SimpleDateFormat("MMM d, yyyy h:mm a", Locale.getDefault()).format(Date(photo.timestamp)) + } + + Scaffold( + topBar = { + TopAppBar( + title = { Text("Photo") }, + navigationIcon = { + IconButton(onClick = onBack) { + Icon(Icons.AutoMirrored.Filled.ArrowBack, contentDescription = "Back") + } + }, + actions = { + IconButton(onClick = { sharePhoto(context, photo) }) { + Icon(Icons.Default.Share, contentDescription = "Share") + } + IconButton(onClick = { showDeleteDialog = true }) { + Icon(Icons.Default.Delete, contentDescription = "Delete") + } + } + ) + } + ) { padding -> + Column(modifier =
Modifier.fillMaxSize().padding(padding)) { + bitmap?.let { bmp -> + Image( + bitmap = bmp.asImageBitmap(), + contentDescription = photo.description ?: "Photo", + modifier = Modifier.fillMaxWidth().weight(1f), + contentScale = ContentScale.Fit + ) + } + Column(modifier = Modifier.fillMaxWidth().padding(16.dp)) { + Text(formattedDate, style = MaterialTheme.typography.bodyMedium, color = MaterialTheme.colorScheme.onSurfaceVariant) + if (!photo.description.isNullOrEmpty()) { + Spacer(Modifier.height(4.dp)) + Text(photo.description, style = MaterialTheme.typography.bodyLarge) + } + } + } + } + + if (showDeleteDialog) { + AlertDialog( + onDismissRequest = { showDeleteDialog = false }, + title = { Text("Delete photo?") }, + text = { Text("This action cannot be undone.") }, + confirmButton = { + TextButton(onClick = { + PhotoCaptureStore.deletePhoto(context, photo) + showDeleteDialog = false + onBack() + }) { Text("Delete") } + }, + dismissButton = { + TextButton(onClick = { showDeleteDialog = false }) { Text("Cancel") } + } + ) + } +} + +private fun sharePhoto(context: Context, photo: CapturedPhoto) { + val file = PhotoCaptureStore.getPhotoFile(context, photo) + if (!file.exists()) return + val uri = FileProvider.getUriForFile(context, "${context.packageName}.provider", file) + val intent = Intent(Intent.ACTION_SEND).apply { + type = "image/jpeg" + putExtra(Intent.EXTRA_STREAM, uri) + addFlags(Intent.FLAG_GRANT_READ_URI_PERMISSION) + } + context.startActivity(Intent.createChooser(intent, "Share photo")) +} diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/GalleryScreen.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/GalleryScreen.kt new file mode 100644 index 00000000..f5d673e8 --- /dev/null +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/GalleryScreen.kt @@ -0,0 +1,126 @@ +package 
com.meta.wearable.dat.externalsampleapps.cameraaccess.ui + +import android.graphics.Bitmap +import androidx.compose.foundation.Image +import androidx.compose.foundation.clickable +import androidx.compose.foundation.layout.Arrangement +import androidx.compose.foundation.layout.Box +import androidx.compose.foundation.layout.Column +import androidx.compose.foundation.layout.PaddingValues +import androidx.compose.foundation.layout.aspectRatio +import androidx.compose.foundation.layout.fillMaxSize +import androidx.compose.foundation.layout.fillMaxWidth +import androidx.compose.foundation.layout.padding +import androidx.compose.foundation.lazy.grid.GridCells +import androidx.compose.foundation.lazy.grid.LazyVerticalGrid +import androidx.compose.foundation.lazy.grid.items +import androidx.compose.material.icons.Icons +import androidx.compose.material.icons.automirrored.filled.ArrowBack +import androidx.compose.material3.ExperimentalMaterial3Api +import androidx.compose.material3.Icon +import androidx.compose.material3.IconButton +import androidx.compose.material3.MaterialTheme +import androidx.compose.material3.Scaffold +import androidx.compose.material3.Text +import androidx.compose.material3.TopAppBar +import androidx.compose.runtime.Composable +import androidx.compose.runtime.LaunchedEffect +import androidx.compose.runtime.collectAsState +import androidx.compose.runtime.getValue +import androidx.compose.runtime.mutableStateOf +import androidx.compose.runtime.remember +import androidx.compose.runtime.setValue +import androidx.compose.ui.Alignment +import androidx.compose.ui.Modifier +import androidx.compose.ui.graphics.asImageBitmap +import androidx.compose.ui.layout.ContentScale +import androidx.compose.ui.platform.LocalContext +import androidx.compose.ui.unit.dp +import com.meta.wearable.dat.externalsampleapps.cameraaccess.gallery.CapturedPhoto +import com.meta.wearable.dat.externalsampleapps.cameraaccess.gallery.PhotoCaptureStore + 
+@OptIn(ExperimentalMaterial3Api::class) +@Composable +fun GalleryScreen( + onBack: () -> Unit, + onPhotoSelected: (CapturedPhoto) -> Unit +) { + val photos by PhotoCaptureStore.photos.collectAsState() + val context = LocalContext.current + + LaunchedEffect(Unit) { + PhotoCaptureStore.loadPhotos(context) + } + + Scaffold( + topBar = { + TopAppBar( + title = { Text("Gallery") }, + navigationIcon = { + IconButton(onClick = onBack) { + Icon(Icons.AutoMirrored.Filled.ArrowBack, contentDescription = "Back") + } + } + ) + } + ) { padding -> + if (photos.isEmpty()) { + Box( + modifier = Modifier.fillMaxSize().padding(padding), + contentAlignment = Alignment.Center + ) { + Column( + horizontalAlignment = Alignment.CenterHorizontally, + verticalArrangement = Arrangement.spacedBy(8.dp) + ) { + Text( + "No captured photos yet", + style = MaterialTheme.typography.titleMedium, + color = MaterialTheme.colorScheme.onSurfaceVariant + ) + Text( + "Ask the AI to take a photo while using the glasses", + style = MaterialTheme.typography.bodyMedium, + color = MaterialTheme.colorScheme.onSurfaceVariant.copy(alpha = 0.7f) + ) + } + } + } else { + LazyVerticalGrid( + columns = GridCells.Fixed(3), + modifier = Modifier.fillMaxSize().padding(padding), + contentPadding = PaddingValues(2.dp), + horizontalArrangement = Arrangement.spacedBy(2.dp), + verticalArrangement = Arrangement.spacedBy(2.dp) + ) { + items(photos, key = { it.id }) { photo -> + GalleryThumbnail(photo = photo, onClick = { onPhotoSelected(photo) }) + } + } + } + } +} + +@Composable +private fun GalleryThumbnail(photo: CapturedPhoto, onClick: () -> Unit) { + val context = LocalContext.current + var bitmap by remember { mutableStateOf<Bitmap?>(null) } + + LaunchedEffect(photo.id) { + bitmap = PhotoCaptureStore.loadBitmap(context, photo) + } + + Box( + modifier = Modifier.aspectRatio(1f).fillMaxWidth().clickable(onClick = onClick), + contentAlignment = Alignment.Center + ) { + bitmap?.let { bmp -> + Image( + bitmap = bmp.asImageBitmap(),
+ contentDescription = photo.description ?: "Photo", + modifier = Modifier.fillMaxSize(), + contentScale = ContentScale.Crop + ) + } + } +} diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/NonStreamScreen.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/NonStreamScreen.kt index 394c5625..75580704 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/NonStreamScreen.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/NonStreamScreen.kt @@ -56,8 +56,6 @@ import androidx.compose.ui.text.font.FontWeight import androidx.compose.ui.text.style.TextAlign import androidx.compose.ui.unit.dp import androidx.lifecycle.compose.collectAsStateWithLifecycle -import com.meta.wearable.dat.core.types.Permission -import com.meta.wearable.dat.core.types.PermissionStatus import com.meta.wearable.dat.core.types.RegistrationState import com.meta.wearable.dat.externalsampleapps.cameraaccess.R import com.meta.wearable.dat.externalsampleapps.cameraaccess.wearables.WearablesViewModel @@ -67,7 +65,6 @@ import kotlinx.coroutines.launch @Composable fun NonStreamScreen( viewModel: WearablesViewModel, - onRequestWearablesPermission: suspend (Permission) -> PermissionStatus, modifier: Modifier = Modifier, ) { val uiState by viewModel.uiState.collectAsStateWithLifecycle() @@ -183,7 +180,7 @@ fun NonStreamScreen( // Start Streaming Button (glasses) SwitchButton( label = stringResource(R.string.stream_button_title), - onClick = { viewModel.navigateToStreaming(onRequestWearablesPermission) }, + onClick = { viewModel.navigateToStreaming() }, enabled = uiState.hasActiveDevice, ) diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/SettingsScreen.kt 
b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/SettingsScreen.kt index dd913363..1c58add3 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/SettingsScreen.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/SettingsScreen.kt @@ -2,46 +2,54 @@ package com.meta.wearable.dat.externalsampleapps.cameraaccess.ui import androidx.compose.foundation.layout.Arrangement import androidx.compose.foundation.layout.Column +import androidx.compose.foundation.layout.Row import androidx.compose.foundation.layout.Spacer import androidx.compose.foundation.layout.fillMaxSize import androidx.compose.foundation.layout.fillMaxWidth import androidx.compose.foundation.layout.height import androidx.compose.foundation.layout.navigationBarsPadding import androidx.compose.foundation.layout.padding +import androidx.compose.foundation.layout.width import androidx.compose.foundation.rememberScrollState import androidx.compose.foundation.text.KeyboardOptions import androidx.compose.foundation.verticalScroll import androidx.compose.material.icons.Icons import androidx.compose.material.icons.automirrored.filled.ArrowBack -import androidx.compose.foundation.layout.Row import androidx.compose.material3.AlertDialog import androidx.compose.material3.ExperimentalMaterial3Api import androidx.compose.material3.Icon import androidx.compose.material3.IconButton import androidx.compose.material3.MaterialTheme import androidx.compose.material3.OutlinedTextField -import androidx.compose.material3.Switch import androidx.compose.material3.Text import androidx.compose.material3.TextButton +import androidx.compose.material3.Switch import androidx.compose.material3.TopAppBar import androidx.compose.runtime.Composable import androidx.compose.runtime.getValue import androidx.compose.runtime.mutableStateOf import 
androidx.compose.runtime.remember +import androidx.compose.runtime.rememberCoroutineScope import androidx.compose.runtime.setValue +import androidx.compose.ui.Alignment import androidx.compose.ui.Modifier import androidx.compose.ui.graphics.Color import androidx.compose.ui.text.font.FontFamily import androidx.compose.ui.text.input.KeyboardType import androidx.compose.ui.unit.dp import com.meta.wearable.dat.externalsampleapps.cameraaccess.settings.SettingsManager +import kotlinx.coroutines.launch @OptIn(ExperimentalMaterial3Api::class) @Composable fun SettingsScreen( onBack: () -> Unit, + onDebugMenu: (() -> Unit)? = null, + onOpenClawNewSession: (suspend () -> String)? = null, + onOpenClawCompactSession: (suspend () -> String)? = null, modifier: Modifier = Modifier, ) { + val coroutineScope = rememberCoroutineScope() var geminiAPIKey by remember { mutableStateOf(SettingsManager.geminiAPIKey) } var systemPrompt by remember { mutableStateOf(SettingsManager.geminiSystemPrompt) } var openClawHost by remember { mutableStateOf(SettingsManager.openClawHost) } @@ -51,7 +59,9 @@ fun SettingsScreen( var webrtcSignalingURL by remember { mutableStateOf(SettingsManager.webrtcSignalingURL) } var videoStreamingEnabled by remember { mutableStateOf(SettingsManager.videoStreamingEnabled) } var proactiveNotificationsEnabled by remember { mutableStateOf(SettingsManager.proactiveNotificationsEnabled) } + var demoSpeakerModeEnabled by remember { mutableStateOf(SettingsManager.demoSpeakerModeEnabled) } var showResetDialog by remember { mutableStateOf(false) } + var developerStatus by remember { mutableStateOf<String?>(null) } fun save() { SettingsManager.geminiAPIKey = geminiAPIKey.trim() @@ -63,6 +73,7 @@ fun SettingsScreen( SettingsManager.webrtcSignalingURL = webrtcSignalingURL.trim() SettingsManager.videoStreamingEnabled = videoStreamingEnabled SettingsManager.proactiveNotificationsEnabled = proactiveNotificationsEnabled + SettingsManager.demoSpeakerModeEnabled = demoSpeakerModeEnabled } fun
reload() { @@ -75,6 +86,7 @@ fun SettingsScreen( webrtcSignalingURL = SettingsManager.webrtcSignalingURL videoStreamingEnabled = SettingsManager.videoStreamingEnabled proactiveNotificationsEnabled = SettingsManager.proactiveNotificationsEnabled + demoSpeakerModeEnabled = SettingsManager.demoSpeakerModeEnabled } Column(modifier = modifier.fillMaxSize()) { @@ -98,6 +110,62 @@ fun SettingsScreen( .navigationBarsPadding(), verticalArrangement = Arrangement.spacedBy(16.dp), ) { + // Video section + + SectionHeader("Video") + Row( + modifier = Modifier.fillMaxWidth(), + horizontalArrangement = Arrangement.SpaceBetween, + verticalAlignment = Alignment.CenterVertically, + ) { + Column(modifier = Modifier.weight(1f)) { + Text("Video streaming") + Spacer(modifier = Modifier.height(4.dp)) + Text( + text = if (videoStreamingEnabled) { + "Streams camera video, sends frames to Gemini, and attaches images to OpenClaw tool calls." + } else { + "Disables glasses/phone video, Gemini video frames, and OpenClaw image upload." 
+ }, + style = MaterialTheme.typography.bodySmall, + color = MaterialTheme.colorScheme.onSurfaceVariant, + ) + } + Spacer(modifier = Modifier.width(12.dp)) + Switch( + checked = videoStreamingEnabled, + onCheckedChange = { + videoStreamingEnabled = it + SettingsManager.videoStreamingEnabled = it + }, + ) + } + + SectionHeader("Audio") + Row( + modifier = Modifier.fillMaxWidth(), + horizontalArrangement = Arrangement.SpaceBetween, + verticalAlignment = Alignment.CenterVertically, + ) { + Column(modifier = Modifier.weight(1f)) { + Text("Demo speaker mode") + Spacer(modifier = Modifier.height(4.dp)) + Text( + "Routes Gemini audio as normal media to the phone speaker so scrcpy can mirror it to the Mac.", + style = MaterialTheme.typography.bodySmall, + color = MaterialTheme.colorScheme.onSurfaceVariant, + ) + } + Spacer(modifier = Modifier.width(12.dp)) + Switch( + checked = demoSpeakerModeEnabled, + onCheckedChange = { + demoSpeakerModeEnabled = it + SettingsManager.demoSpeakerModeEnabled = it + }, + ) + } + // Gemini section SectionHeader("Gemini API") MonoTextField( @@ -155,46 +223,76 @@ fun SettingsScreen( keyboardType = KeyboardType.Uri, ) - // Video - SectionHeader("Video") + SectionHeader("Notifications") Row( modifier = Modifier.fillMaxWidth(), horizontalArrangement = Arrangement.SpaceBetween, - verticalAlignment = androidx.compose.ui.Alignment.CenterVertically, + verticalAlignment = Alignment.CenterVertically, ) { - Column { - Text("Video Streaming", style = MaterialTheme.typography.bodyLarge) + Column(modifier = Modifier.weight(1f)) { + Text("Proactive Notifications", style = MaterialTheme.typography.bodyLarge) + Spacer(modifier = Modifier.height(4.dp)) Text( - "Disable to save battery. 
Audio remains active.", + "Receive OpenClaw updates spoken through Gemini while a session is active.", style = MaterialTheme.typography.bodySmall, color = MaterialTheme.colorScheme.onSurfaceVariant, ) } + Spacer(modifier = Modifier.width(12.dp)) Switch( - checked = videoStreamingEnabled, - onCheckedChange = { videoStreamingEnabled = it }, + checked = proactiveNotificationsEnabled, + onCheckedChange = { + proactiveNotificationsEnabled = it + SettingsManager.proactiveNotificationsEnabled = it + }, ) } - // Notifications - SectionHeader("Notifications") - Row( - modifier = Modifier.fillMaxWidth(), - horizontalArrangement = Arrangement.SpaceBetween, - verticalAlignment = androidx.compose.ui.Alignment.CenterVertically, - ) { - Column { - Text("Proactive Notifications", style = MaterialTheme.typography.bodyLarge) + // Debug menu (only in debug builds) + onDebugMenu?.let { onDebug -> + SectionHeader("Developer") + TextButton(onClick = onDebug) { + Text("Mock Device Kit") + } + Row( + modifier = Modifier.fillMaxWidth(), + horizontalArrangement = Arrangement.spacedBy(8.dp), + verticalAlignment = Alignment.CenterVertically, + ) { + TextButton( + onClick = { + save() + developerStatus = "Sending /new..." + coroutineScope.launch { + developerStatus = onOpenClawNewSession?.invoke() + ?: "OpenClaw command is unavailable." + } + }, + enabled = onOpenClawNewSession != null, + ) { + Text("New OpenClaw Session") + } + TextButton( + onClick = { + save() + developerStatus = "Sending /compact..." + coroutineScope.launch { + developerStatus = onOpenClawCompactSession?.invoke() + ?: "OpenClaw command is unavailable." 
+ } + }, + enabled = onOpenClawCompactSession != null, + ) { + Text("Compact") + } + } + developerStatus?.let { status -> Text( - "Receive updates from OpenClaw spoken through glasses.", + text = status, style = MaterialTheme.typography.bodySmall, color = MaterialTheme.colorScheme.onSurfaceVariant, ) } - Switch( - checked = proactiveNotificationsEnabled, - onCheckedChange = { proactiveNotificationsEnabled = it }, - ) } // Reset diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/StreamScreen.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/StreamScreen.kt index de605ca6..ab8e3f90 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/StreamScreen.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/ui/StreamScreen.kt @@ -14,23 +14,49 @@ import androidx.activity.compose.LocalActivity import androidx.compose.foundation.Image import androidx.compose.foundation.layout.Box import androidx.compose.foundation.layout.Column +import androidx.compose.foundation.layout.Row import androidx.compose.foundation.layout.Spacer import androidx.compose.foundation.layout.fillMaxSize import androidx.compose.foundation.layout.height import androidx.compose.foundation.layout.padding +import androidx.compose.foundation.layout.size +import androidx.compose.foundation.layout.width +import androidx.compose.foundation.layout.widthIn import androidx.compose.foundation.layout.statusBarsPadding +import androidx.compose.foundation.shape.CircleShape +import androidx.compose.material.icons.Icons +import androidx.compose.material.icons.filled.PhotoLibrary +import androidx.compose.material.icons.filled.Videocam +import androidx.compose.material.icons.filled.VideocamOff import androidx.compose.material3.CircularProgressIndicator +import 
androidx.compose.material3.FilterChip +import androidx.compose.material3.Icon +import androidx.compose.material3.IconButton +import androidx.compose.material3.SegmentedButton +import androidx.compose.material3.SegmentedButtonDefaults +import androidx.compose.material3.SingleChoiceSegmentedButtonRow +import androidx.compose.material3.Surface +import androidx.compose.material3.Text import androidx.compose.runtime.Composable import androidx.compose.runtime.DisposableEffect import androidx.compose.runtime.LaunchedEffect import androidx.compose.runtime.getValue +import androidx.compose.foundation.pager.HorizontalPager +import androidx.compose.foundation.pager.rememberPagerState +import androidx.compose.runtime.mutableStateOf +import androidx.compose.runtime.remember +import androidx.compose.runtime.rememberCoroutineScope +import androidx.compose.runtime.setValue +import kotlinx.coroutines.launch import androidx.compose.ui.Alignment import androidx.compose.ui.Modifier +import androidx.compose.ui.graphics.Color import androidx.compose.ui.graphics.asImageBitmap import androidx.compose.ui.layout.ContentScale import androidx.compose.ui.platform.LocalContext import androidx.compose.ui.res.stringResource import androidx.compose.ui.unit.dp +import com.meta.wearable.dat.externalsampleapps.cameraaccess.gallery.CapturedPhoto import androidx.lifecycle.compose.LocalLifecycleOwner import androidx.lifecycle.compose.collectAsStateWithLifecycle import androidx.lifecycle.viewmodel.compose.viewModel @@ -38,10 +64,12 @@ import com.meta.wearable.dat.camera.types.StreamSessionState import com.meta.wearable.dat.externalsampleapps.cameraaccess.R import com.meta.wearable.dat.externalsampleapps.cameraaccess.gemini.GeminiSessionViewModel import com.meta.wearable.dat.externalsampleapps.cameraaccess.stream.StreamViewModel +import com.meta.wearable.dat.externalsampleapps.cameraaccess.settings.SettingsManager import com.meta.wearable.dat.externalsampleapps.cameraaccess.stream.StreamingMode import 
com.meta.wearable.dat.externalsampleapps.cameraaccess.wearables.WearablesViewModel import com.meta.wearable.dat.externalsampleapps.cameraaccess.webrtc.WebRTCSessionViewModel +@OptIn(androidx.compose.material3.ExperimentalMaterial3Api::class) @Composable fun StreamScreen( wearablesViewModel: WearablesViewModel, @@ -61,9 +89,21 @@ fun StreamScreen( val streamUiState by streamViewModel.uiState.collectAsStateWithLifecycle() val geminiUiState by geminiViewModel.uiState.collectAsStateWithLifecycle() val webrtcUiState by webrtcViewModel.uiState.collectAsStateWithLifecycle() + val captureEvent by geminiViewModel.captureEvent.collectAsStateWithLifecycle() val lifecycleOwner = LocalLifecycleOwner.current val context = LocalContext.current + // Gallery navigation state + var showGallery by remember { mutableStateOf(false) } + var selectedGalleryPhoto by remember { mutableStateOf<CapturedPhoto?>(null) } + + // Show toast when photo is captured via Gemini + LaunchedEffect(captureEvent) { + captureEvent?.let { + Toast.makeText(context, "Photo captured", Toast.LENGTH_SHORT).show() + } + } + // Wire Gemini VM to Stream VM for frame forwarding LaunchedEffect(geminiViewModel) { streamViewModel.geminiViewModel = geminiViewModel @@ -74,14 +114,31 @@ fun StreamScreen( streamViewModel.webrtcViewModel = webrtcViewModel } + var videoStreamingEnabled by remember { mutableStateOf(SettingsManager.videoStreamingEnabled) } + val tabOptions = listOf("Camera", "Chat") + val pagerState = rememberPagerState(pageCount = { 2 }) + val coroutineScope = rememberCoroutineScope() + + // Auto-switch to chat tab when Gemini starts in audio-only mode + LaunchedEffect(geminiUiState.isGeminiActive) { + if (geminiUiState.isGeminiActive && !SettingsManager.videoStreamingEnabled) { + pagerState.animateScrollToPage(1) + } + } + // Start stream or phone camera - LaunchedEffect(isPhoneMode) { - if (isPhoneMode) { - geminiViewModel.streamingMode = StreamingMode.PHONE - streamViewModel.startPhoneCamera(lifecycleOwner) +
LaunchedEffect(isPhoneMode, videoStreamingEnabled) { + geminiViewModel.streamingMode = if (isPhoneMode) StreamingMode.PHONE else StreamingMode.GLASSES + streamViewModel.setStreamingMode(if (isPhoneMode) StreamingMode.PHONE else StreamingMode.GLASSES) + + if (videoStreamingEnabled) { + if (isPhoneMode) { + streamViewModel.startPhoneCamera(lifecycleOwner) + } else { + streamViewModel.startStream() + } } else { - geminiViewModel.streamingMode = StreamingMode.GLASSES - streamViewModel.startStream() + streamViewModel.setVideoStreamingEnabled(false, lifecycleOwner) } } @@ -112,33 +169,111 @@ fun StreamScreen( } Box(modifier = modifier.fillMaxSize()) { - // Video feed - streamUiState.videoFrame?.let { videoFrame -> - Image( - bitmap = videoFrame.asImageBitmap(), - contentDescription = stringResource(R.string.live_stream), - modifier = Modifier.fillMaxSize(), - contentScale = ContentScale.Crop, - ) - } + HorizontalPager( + state = pagerState, + modifier = Modifier.fillMaxSize(), + ) { page -> + Box(modifier = Modifier.fillMaxSize()) { + if (page == 0) { + // --- Camera tab --- + streamUiState.videoFrame?.let { videoFrame -> + Image( + bitmap = videoFrame.asImageBitmap(), + contentDescription = stringResource(R.string.live_stream), + modifier = Modifier.fillMaxSize(), + contentScale = ContentScale.Crop, + ) + } - if (streamUiState.streamSessionState == StreamSessionState.STARTING) { - CircularProgressIndicator( - modifier = Modifier.align(Alignment.Center), - ) + if (streamUiState.videoFrame == null && !videoStreamingEnabled) { + Text( + text = "Audio-only mode\nAll video streaming is off.", + modifier = Modifier.align(Alignment.Center), + ) + } + + if (streamUiState.streamSessionState == StreamSessionState.STARTING) { + CircularProgressIndicator( + modifier = Modifier.align(Alignment.Center), + ) + } + } else { + // --- Chat tab --- + ChatTranscriptView( + messages = geminiUiState.messages, + modifier = Modifier.padding(top = 100.dp, bottom = 80.dp), + ) + } + } } // 
Overlays + controls Box(modifier = Modifier.fillMaxSize().padding(horizontal = 16.dp)) { // Top overlays (below status bar) Column(modifier = Modifier.align(Alignment.TopStart).statusBarsPadding().padding(top = 8.dp)) { - // Gemini overlay - if (geminiUiState.isGeminiActive) { + Row(verticalAlignment = Alignment.CenterVertically) { + Surface( + shape = CircleShape, + color = Color.Black.copy(alpha = 0.5f), + modifier = Modifier.size(36.dp) + ) { + IconButton(onClick = { + val newEnabled = !videoStreamingEnabled + SettingsManager.videoStreamingEnabled = newEnabled + videoStreamingEnabled = newEnabled + }) { + Icon( + imageVector = if (videoStreamingEnabled) Icons.Default.Videocam else Icons.Default.VideocamOff, + contentDescription = if (videoStreamingEnabled) "Switch to audio-only" else "Enable video", + tint = Color.White, + modifier = Modifier.size(18.dp) + ) + } + } + + Spacer(modifier = Modifier.weight(1f)) + + // Tab switcher (centered) + SingleChoiceSegmentedButtonRow { + tabOptions.forEachIndexed { index, label -> + SegmentedButton( + shape = SegmentedButtonDefaults.itemShape(index = index, count = tabOptions.size), + onClick = { coroutineScope.launch { pagerState.animateScrollToPage(index) } }, + selected = pagerState.currentPage == index, + ) { + Text(label) + } + } + } + + Spacer(modifier = Modifier.weight(1f)) + + // Gallery button (top right) + Surface( + shape = CircleShape, + color = Color.Black.copy(alpha = 0.5f), + modifier = Modifier.size(36.dp) + ) { + IconButton(onClick = { showGallery = true }) { + Icon( + Icons.Default.PhotoLibrary, + contentDescription = "Gallery", + tint = Color.White, + modifier = Modifier.size(18.dp) + ) + } + } + } + + Spacer(modifier = Modifier.height(8.dp)) + + // Gemini overlay (camera tab only) + if (geminiUiState.isGeminiActive && pagerState.currentPage == 0) { GeminiOverlay(uiState = geminiUiState) } // WebRTC overlay - if (webrtcUiState.isActive) { + if (webrtcUiState.isActive && pagerState.currentPage == 0) { 
Spacer(modifier = Modifier.height(4.dp)) WebRTCOverlay(uiState = webrtcUiState) } @@ -152,7 +287,6 @@ fun StreamScreen( streamViewModel.stopStream() wearablesViewModel.navigateToDeviceSelection() }, - onCapturePhoto = { streamViewModel.capturePhoto() }, onToggleAI = { if (geminiUiState.isGeminiActive) { geminiViewModel.stopSession() @@ -161,14 +295,8 @@ fun StreamScreen( } }, isAIActive = geminiUiState.isGeminiActive, - onToggleLive = { - if (webrtcUiState.isActive) { - webrtcViewModel.stopSession() - } else { - webrtcViewModel.startSession() - } - }, - isLiveActive = webrtcUiState.isActive, + onToggleMic = { geminiViewModel.toggleMic() }, + isMicEnabled = geminiUiState.isMicEnabled, modifier = Modifier.align(Alignment.BottomCenter), ) } @@ -187,4 +315,19 @@ fun StreamScreen( ) } } + + // Gallery as full-screen overlay (not replacing StreamScreen, so session stays alive) + if (showGallery || selectedGalleryPhoto != null) { + if (selectedGalleryPhoto != null) { + GalleryDetailScreen( + photo = selectedGalleryPhoto!!, + onBack = { selectedGalleryPhoto = null } + ) + } else { + GalleryScreen( + onBack = { showGallery = false }, + onPhotoSelected = { selectedGalleryPhoto = it } + ) + } + } } diff --git a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/wearables/WearablesViewModel.kt b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/wearables/WearablesViewModel.kt index f3bd2176..0feb7772 100644 --- a/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/wearables/WearablesViewModel.kt +++ b/samples/CameraAccessAndroid/app/src/main/java/com/meta/wearable/dat/externalsampleapps/cameraaccess/wearables/WearablesViewModel.kt @@ -24,8 +24,6 @@ import com.meta.wearable.dat.core.Wearables import com.meta.wearable.dat.core.selectors.AutoDeviceSelector import com.meta.wearable.dat.core.selectors.DeviceSelector import 
com.meta.wearable.dat.core.types.DeviceIdentifier -import com.meta.wearable.dat.core.types.Permission -import com.meta.wearable.dat.core.types.PermissionStatus import com.meta.wearable.dat.core.types.RegistrationState import com.meta.wearable.dat.mockdevice.MockDeviceKit import kotlinx.collections.immutable.toImmutableList @@ -124,34 +122,11 @@ class WearablesViewModel(application: Application) : AndroidViewModel(applicatio Wearables.startUnregistration(activity) } - fun navigateToStreaming(onRequestWearablesPermission: suspend (Permission) -> PermissionStatus) { - viewModelScope.launch { - val permission = Permission.CAMERA // Camera permission is required for streaming - val result = Wearables.checkPermissionStatus(permission) - - // Handle the result - result.onFailure { error, _ -> - setRecentError("Permission check error: ${error.description}") - return@launch - } - - val permissionStatus = result.getOrNull() - if (permissionStatus == PermissionStatus.Granted) { - _uiState.update { it.copy(isStreaming = true) } - return@launch - } - - // Request permission - val requestedPermissionStatus = onRequestWearablesPermission(permission) - when (requestedPermissionStatus) { - PermissionStatus.Denied -> { - setRecentError("Permission denied") - } - PermissionStatus.Granted -> { - _uiState.update { it.copy(isStreaming = true) } - } - } + fun navigateToStreaming() { + if (_uiState.value.isStreaming) { + return } + _uiState.update { it.copy(isStreaming = true, isPhoneMode = false) } } fun navigateToPhoneMode() { diff --git a/samples/CameraAccessAndroid/app/src/main/res/drawable/ic_visionclaw_mark.xml b/samples/CameraAccessAndroid/app/src/main/res/drawable/ic_visionclaw_mark.xml new file mode 100644 index 00000000..9d88fb81 --- /dev/null +++ b/samples/CameraAccessAndroid/app/src/main/res/drawable/ic_visionclaw_mark.xml @@ -0,0 +1,20 @@ + + + + + + + + + diff --git a/samples/CameraAccessAndroid/app/src/main/res/drawable/ic_visionclaw_monochrome.xml 
b/samples/CameraAccessAndroid/app/src/main/res/drawable/ic_visionclaw_monochrome.xml new file mode 100644 index 00000000..ae916dc4 --- /dev/null +++ b/samples/CameraAccessAndroid/app/src/main/res/drawable/ic_visionclaw_monochrome.xml @@ -0,0 +1,20 @@ + + + + + + + + + diff --git a/samples/CameraAccessAndroid/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml b/samples/CameraAccessAndroid/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml index 6ed7537c..20fc149a 100644 --- a/samples/CameraAccessAndroid/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml +++ b/samples/CameraAccessAndroid/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml @@ -1,4 +1,6 @@ - + + + diff --git a/samples/CameraAccessAndroid/app/src/main/res/values/colors.xml b/samples/CameraAccessAndroid/app/src/main/res/values/colors.xml new file mode 100644 index 00000000..83d166a8 --- /dev/null +++ b/samples/CameraAccessAndroid/app/src/main/res/values/colors.xml @@ -0,0 +1,4 @@ + + + #F2EAD7 + diff --git a/samples/CameraAccessAndroid/app/src/main/res/xml/file_paths.xml b/samples/CameraAccessAndroid/app/src/main/res/xml/file_paths.xml index ee605e38..c773eee3 100644 --- a/samples/CameraAccessAndroid/app/src/main/res/xml/file_paths.xml +++ b/samples/CameraAccessAndroid/app/src/main/res/xml/file_paths.xml @@ -1,4 +1,5 @@ + diff --git a/samples/CameraAccessAndroid/gradle.properties b/samples/CameraAccessAndroid/gradle.properties index 2bb943a4..132244e5 100644 --- a/samples/CameraAccessAndroid/gradle.properties +++ b/samples/CameraAccessAndroid/gradle.properties @@ -7,7 +7,6 @@ # Specifies the JVM arguments used for the daemon process. # The setting is particularly useful for tweaking memory settings. org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8 -org.gradle.java.home=/Applications/Android Studio.app/Contents/jbr/Contents/Home # When configured, Gradle will run in incubating parallel mode. # This option should only be used with decoupled projects. 
For more details, visit # https://developer.android.com/r/tools/gradle-multi-project-decoupled-projects