From c95e8d7c91a933b73236d87d64074e0441f7cab4 Mon Sep 17 00:00:00 2001 From: aasthabharill Date: Thu, 7 May 2026 16:58:08 +0530 Subject: [PATCH 1/6] setup test --- .github/workflows/spanner-load-tests.yml | 3 +- .../cloudsql}/CloudSqlShardOrchestrator.java | 66 ++-- .../CloudSqlShardOrchestratorTest.java | 44 ++- .../MySQLMultiSharded1024ShardsLT.java | 6 +- .../PostgreSQLMultiSharded1024ShardsLT.java | 6 +- .../SpannerToSourceDbBacklogStepLT.java | 273 +++++++++++++++ .../SpannerToSourceDbBacklogLT/session.json | 313 ++++++++++++++++++ .../spanner-schema.sql | 12 + 8 files changed, 663 insertions(+), 60 deletions(-) rename {v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting => it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql}/CloudSqlShardOrchestrator.java (90%) rename {v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting => it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/cloudsql}/CloudSqlShardOrchestratorTest.java (89%) create mode 100644 v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java create mode 100644 v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbBacklogLT/session.json create mode 100644 v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbBacklogLT/spanner-schema.sql diff --git a/.github/workflows/spanner-load-tests.yml b/.github/workflows/spanner-load-tests.yml index bca53402e2..3c553158a6 100644 --- a/.github/workflows/spanner-load-tests.yml +++ b/.github/workflows/spanner-load-tests.yml @@ -32,7 +32,7 @@ permissions: write-all jobs: load_tests: name: Spanner Dataflow Templates Load tests - timeout-minutes: 1440 # 1 day + timeout-minutes: 1800 # 30 hours # Run on any runner that matches all the specified runs-on values. runs-on: [ self-hosted, perf ] steps: @@ -57,6 +57,7 @@ jobs: --lt-export-project="cloud-teleport-testing" \ --lt-export-dataset="performance_tests" \ --lt-export-table="template_performance_metrics" \ + --test="${{ github.event.inputs.specific_test }}" - name: Upload Load Tests Report uses: actions/upload-artifact@v7 if: always() # always run even if the previous step fails diff --git a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/CloudSqlShardOrchestrator.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql/CloudSqlShardOrchestrator.java similarity index 90% rename from v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/CloudSqlShardOrchestrator.java rename to it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql/CloudSqlShardOrchestrator.java index 32e8853e77..d700409782 100644 --- a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/CloudSqlShardOrchestrator.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql/CloudSqlShardOrchestrator.java @@ -13,7 +13,7 @@ * License for the specific language governing permissions and limitations under * the License. 
*/ -package com.google.cloud.teleport.v2.templates.loadtesting; +package org.apache.beam.it.gcp.cloudsql; import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport; import com.google.api.client.googleapis.json.GoogleJsonResponseException; @@ -27,7 +27,6 @@ import com.google.auth.http.HttpCredentialsAdapter; import com.google.auth.oauth2.GoogleCredentials; import com.google.cloud.storage.BlobInfo; -import com.google.cloud.teleport.v2.source.reader.io.jdbc.iowrapper.config.SQLDialect; import com.google.common.annotations.VisibleForTesting; import java.io.IOException; import java.security.GeneralSecurityException; @@ -39,9 +38,6 @@ import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import org.apache.beam.it.gcp.artifacts.GcsArtifact; -import org.apache.beam.it.gcp.cloudsql.CloudMySQLResourceManager; -import org.apache.beam.it.gcp.cloudsql.CloudPostgresResourceManager; -import org.apache.beam.it.gcp.cloudsql.CloudSqlResourceManager; import org.apache.beam.it.gcp.storage.GcsResourceManager; import org.json.JSONArray; import org.json.JSONObject; @@ -72,7 +68,12 @@ public class CloudSqlShardOrchestrator { public static final String MYSQL_8_0 = "MYSQL_8_0"; public static final String POSTGRES_14 = "POSTGRES_14"; - protected final SQLDialect sqlDialect; + public enum DatabaseType { + MYSQL, + POSTGRESQL + } + + protected final DatabaseType databaseType; protected final String dbVersion; protected final int port; protected final String project; @@ -80,21 +81,22 @@ public class CloudSqlShardOrchestrator { protected final String username; protected final String password; protected final GcsResourceManager gcsResourceManager; - protected final Map managers; + public final Map managers; protected final Map instanceIpMap; - protected Map> requestedShardMap; + public Map> requestedShardMap; protected final SQLAdmin sqlAdmin; /** * Constructs a new orchestrator for the specified database dialect. * - * @param dbType The dialect of the source database (e.g., MYSQL, POSTGRESQL). + * @param dbType The type of the source database (MYSQL or POSTGRESQL). + * @param dbVersion The version of the database. * @param project The GCP project ID. * @param region The GCP region for Cloud SQL instances. * @param gcsResourceManager The GCS resource manager for uploading configuration artifacts. */ public CloudSqlShardOrchestrator( - SQLDialect dbType, + DatabaseType dbType, String dbVersion, String project, String region, @@ -106,7 +108,7 @@ public CloudSqlShardOrchestrator( region, gcsResourceManager, System.getProperty( - "cloudProxyUsername", (dbType == SQLDialect.MYSQL) ? "root" : "postgres"), + "cloudProxyUsername", (dbType == DatabaseType.MYSQL) ? "root" : "postgres"), System.getProperty("cloudProxyPassword", ""), null); } @@ -114,7 +116,8 @@ public CloudSqlShardOrchestrator( /** * Constructs a new orchestrator with explicit credentials. * - * @param sqlDialect The dialect of the source database. + * @param databaseType The type of the source database. + * @param dbVersion The version of the database. * @param project The GCP project ID. * @param region The GCP region. * @param gcsResourceManager The GCS resource manager. @@ -123,7 +126,7 @@ public CloudSqlShardOrchestrator( * @param credentials The GCP credentials to use. 
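 * <p>A minimal usage sketch; the project, region, and password values below are illustrative
 * placeholders, and {@code gcsResourceManager} is assumed to be an already-initialized GCS
 * resource manager:
 *
 * <pre>{@code
 * CloudSqlShardOrchestrator orchestrator =
 *     new CloudSqlShardOrchestrator(
 *         DatabaseType.MYSQL,
 *         MYSQL_8_0,
 *         "example-project",
 *         "us-central1",
 *         gcsResourceManager,
 *         "root",
 *         "example-password",
 *         GoogleCredentials.getApplicationDefault());
 * }</pre>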
*/ public CloudSqlShardOrchestrator( - SQLDialect sqlDialect, + DatabaseType databaseType, String dbVersion, String project, String region, @@ -131,8 +134,8 @@ public CloudSqlShardOrchestrator( String username, String password, GoogleCredentials credentials) { - checkVersionCompatibility(sqlDialect, dbVersion); - this.sqlDialect = sqlDialect; + checkVersionCompatibility(databaseType, dbVersion); + this.databaseType = databaseType; this.dbVersion = dbVersion; this.project = project; this.region = region; @@ -158,14 +161,14 @@ public CloudSqlShardOrchestrator( LOG.error("Exception while initializing SQL Admin", e); throw new RuntimeException("Failed to initialize SQLAdmin client", e); } - port = (sqlDialect == SQLDialect.MYSQL) ? 3306 : 5432; + port = (databaseType == DatabaseType.MYSQL) ? 3306 : 5432; } @VisibleForTesting - protected static void checkVersionCompatibility(SQLDialect sqlDialect, String dbVersion) { + protected static void checkVersionCompatibility(DatabaseType databaseType, String dbVersion) { Preconditions.checkArgument( - (sqlDialect == SQLDialect.MYSQL && dbVersion.toLowerCase().startsWith("mysql")) - || (sqlDialect == SQLDialect.POSTGRESQL + (databaseType == DatabaseType.MYSQL && dbVersion.toLowerCase().startsWith("mysql")) + || (databaseType == DatabaseType.POSTGRESQL && dbVersion.toLowerCase().startsWith("postgres"))); } @@ -213,10 +216,9 @@ protected T executeWithRetries( * @param shardMap A mapping of physical instance names to the list of logical DB names to create. * @param artifactName The name of the artifact file (e.g., "shards.json"). * @return The full GCS URI to the generated bulkShardConfig.json. - * @throws ShardOrchestrationException if provisioning or creation fails after retries. + * @throws RuntimeException if provisioning or creation fails after retries. */ - public String initialize(Map> shardMap, String artifactName) - throws ShardOrchestrationException { + public String initialize(Map> shardMap, String artifactName) { this.requestedShardMap = new HashMap<>(shardMap); LOG.info("Initializing shard orchestrator for {} physical instances", shardMap.size()); @@ -231,7 +233,7 @@ public String initialize(Map> shardMap, String artifactName return generateAndUploadConfig(artifactName); } catch (Exception e) { LOG.error("Exception while initializing sharded environment", e); - throw new ShardOrchestrationException("Failed to initialize sharded environment", e); + throw new RuntimeException("Failed to initialize sharded environment", e); } } @@ -265,7 +267,7 @@ protected void updateUserPassword(String instanceName) throws IOException, Inter User user = new User().setName(username).setPassword(password); // MySQL requires a '%' host for connections from any IP within the VPC. - if (sqlDialect == SQLDialect.MYSQL) { + if (databaseType == DatabaseType.MYSQL) { user.setHost("%"); } @@ -273,7 +275,7 @@ protected void updateUserPassword(String instanceName) throws IOException, Inter // These are passed as query parameters in the Update request. SQLAdmin.Users.Update request = sqlAdmin.users().update(project, instanceName, user); request.setName(username); - if (sqlDialect == SQLDialect.MYSQL) { + if (databaseType == DatabaseType.MYSQL) { // In MySQL, host is part of the primary key for a user. '%' allows the user to connect from // any VPC IP. 
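    // Note: the name/host query parameters set below identify which existing 'user'@'host' row
    // the Admin API should update; the User body above carries the new values for that row.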
request.setHost("%"); @@ -319,7 +321,7 @@ protected String ensureInstanceAndGetIp(String instanceName) protected void createPhysicalInstance(String instanceName) throws IOException, InterruptedException { - String tier = sqlDialect == SQLDialect.MYSQL ? "db-n1-standard-2" : "db-custom-2-7680"; + String tier = databaseType == DatabaseType.MYSQL ? "db-n1-standard-2" : "db-custom-2-7680"; DatabaseInstance instance = new DatabaseInstance() .setName(instanceName) @@ -370,19 +372,19 @@ protected void waitForOperation(Operation operation) throws IOException, Interru protected CloudSqlResourceManager createManager(String instanceName) { String ip = instanceIpMap.get(instanceName); - if (sqlDialect == SQLDialect.MYSQL) { + if (databaseType == DatabaseType.MYSQL) { return (CloudSqlResourceManager) CloudMySQLResourceManager.builder(instanceName) .maybeUseStaticInstance(ip, port, username, password) .build(); - } else if (sqlDialect == SQLDialect.POSTGRESQL) { + } else if (databaseType == DatabaseType.POSTGRESQL) { return (CloudSqlResourceManager) CloudPostgresResourceManager.builder(instanceName) .maybeUseStaticInstance(ip, port, username, password) .setDatabaseName("postgres") .build(); } else { - throw new IllegalArgumentException("Unsupported database type: " + sqlDialect); + throw new IllegalArgumentException("Unsupported database type: " + databaseType); } } @@ -456,16 +458,16 @@ protected void awaitAndShutdownExecutor( executor.shutdown(); try { if (!executor.awaitTermination(60, TimeUnit.MINUTES)) { - throw new ShardOrchestrationException(phase + " phase timed out"); + throw new RuntimeException(phase + " phase timed out"); } for (java.util.concurrent.Future future : futures) { future.get(); } } catch (java.util.concurrent.ExecutionException e) { - throw new ShardOrchestrationException(phase + " phase failed", e.getCause()); + throw new RuntimeException(phase + " phase failed", e.getCause()); } catch (InterruptedException e) { Thread.currentThread().interrupt(); - throw new ShardOrchestrationException(phase + " phase interrupted", e); + throw new RuntimeException(phase + " phase interrupted", e); } } diff --git a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/CloudSqlShardOrchestratorTest.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/cloudsql/CloudSqlShardOrchestratorTest.java similarity index 89% rename from v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/CloudSqlShardOrchestratorTest.java rename to it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/cloudsql/CloudSqlShardOrchestratorTest.java index 48a40db735..29b348c896 100644 --- a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/CloudSqlShardOrchestratorTest.java +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/cloudsql/CloudSqlShardOrchestratorTest.java @@ -13,11 +13,11 @@ * License for the specific language governing permissions and limitations under * the License. 
*/ -package com.google.cloud.teleport.v2.templates.loadtesting; +package org.apache.beam.it.gcp.cloudsql; -import static com.google.cloud.teleport.v2.templates.loadtesting.CloudSqlShardOrchestrator.MYSQL_8_0; -import static com.google.cloud.teleport.v2.templates.loadtesting.CloudSqlShardOrchestrator.POSTGRES_14; import static com.google.common.truth.Truth.assertThat; +import static org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator.MYSQL_8_0; +import static org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator.POSTGRES_14; import static org.junit.Assert.assertThrows; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyInt; @@ -41,7 +41,6 @@ import com.google.auth.oauth2.GoogleCredentials; import com.google.cloud.storage.Blob; import com.google.cloud.storage.BlobInfo; -import com.google.cloud.teleport.v2.source.reader.io.jdbc.iowrapper.config.SQLDialect; import com.google.common.util.concurrent.MoreExecutors; import java.io.IOException; import java.util.Arrays; @@ -50,8 +49,7 @@ import java.util.Map; import java.util.concurrent.ExecutorService; import org.apache.beam.it.gcp.artifacts.GcsArtifact; -import org.apache.beam.it.gcp.cloudsql.CloudMySQLResourceManager; -import org.apache.beam.it.gcp.cloudsql.CloudPostgresResourceManager; +import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator.DatabaseType; import org.apache.beam.it.gcp.storage.GcsResourceManager; import org.junit.Before; import org.junit.Test; @@ -88,13 +86,13 @@ public class CloudSqlShardOrchestratorTest { /** Helper subclass to inject mock SQLAdmin and synchronous executor. */ private static class TestCloudSqlShardOrchestrator extends CloudSqlShardOrchestrator { public TestCloudSqlShardOrchestrator( - SQLDialect sqlDialect, + DatabaseType databaseType, String dbVersion, String project, String region, GcsResourceManager gcsResourceManager, SQLAdmin sqlAdmin) { - super(sqlDialect, dbVersion, project, region, gcsResourceManager); + super(databaseType, dbVersion, project, region, gcsResourceManager); // Overwrite the real sqlAdmin created in super constructor try { java.lang.reflect.Field field = @@ -138,7 +136,7 @@ public void setUp() { public void testInitialize_provisionsAndSetsUpCorrectly() throws Exception { TestCloudSqlShardOrchestrator orchestrator = new TestCloudSqlShardOrchestrator( - SQLDialect.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); + DatabaseType.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); // Mock Stage 1: Physical Provisioning DatabaseInstance instance = @@ -189,7 +187,7 @@ public void testInitialize_provisionsAndSetsUpCorrectly() throws Exception { public void testInitialize_createsInstance_whenMissing() throws Exception { TestCloudSqlShardOrchestrator orchestrator = new TestCloudSqlShardOrchestrator( - SQLDialect.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); + DatabaseType.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); // Mock 404 for first call, then 200 for refresh when(sqlAdmin.instances().get(PROJECT_ID, INSTANCE_NAME).execute()) @@ -228,7 +226,7 @@ public void testInitialize_createsInstance_whenMissing() throws Exception { public void testCleanup_delegatesToManagers() throws Exception { TestCloudSqlShardOrchestrator orchestrator = new TestCloudSqlShardOrchestrator( - SQLDialect.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); + DatabaseType.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); // Mock successful 
initialization to populate managers map when(sqlAdmin.instances().get(PROJECT_ID, INSTANCE_NAME).execute()) @@ -272,7 +270,7 @@ public void testConstructor_withDefaultCredentials() throws Exception { CloudSqlShardOrchestrator orchestrator = new CloudSqlShardOrchestrator( - SQLDialect.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager); + DatabaseType.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager); assertThat(orchestrator.project).isEqualTo(PROJECT_ID); } @@ -282,7 +280,7 @@ public void testConstructor_withDefaultCredentials() throws Exception { public void testInitialize_provisionsAndSetsUpPostgresCorrectly() throws Exception { TestCloudSqlShardOrchestrator orchestrator = new TestCloudSqlShardOrchestrator( - SQLDialect.POSTGRESQL, POSTGRES_14, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); + DatabaseType.POSTGRESQL, POSTGRES_14, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); DatabaseInstance instance = new DatabaseInstance() @@ -318,20 +316,19 @@ public void testInitialize_provisionsAndSetsUpPostgresCorrectly() throws Excepti public void testInitialize_throwsOnFailure() throws Exception { TestCloudSqlShardOrchestrator orchestrator = new TestCloudSqlShardOrchestrator( - SQLDialect.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); + DatabaseType.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); when(sqlAdmin.instances().get(anyString(), anyString()).execute()) .thenThrow(new IOException("API Error")); - assertThrows( - ShardOrchestrationException.class, () -> orchestrator.initialize(shardMap, "shards.json")); + assertThrows(RuntimeException.class, () -> orchestrator.initialize(shardMap, "shards.json")); } @Test public void testExecuteWithRetries_retriesOn409() throws Exception { TestCloudSqlShardOrchestrator orchestrator = new TestCloudSqlShardOrchestrator( - SQLDialect.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); + DatabaseType.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); com.google.api.client.googleapis.services.AbstractGoogleClientRequest mockRequest = @@ -355,7 +352,7 @@ public void testExecuteWithRetries_retriesOn409() throws Exception { public void testWaitForOperation_throwsOnOpError() throws Exception { TestCloudSqlShardOrchestrator orchestrator = new TestCloudSqlShardOrchestrator( - SQLDialect.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); + DatabaseType.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); Operation op = mock(Operation.class, Answers.RETURNS_DEEP_STUBS); when(op.getName()).thenReturn("op-error"); @@ -373,7 +370,7 @@ public void testWaitForOperation_throwsOnOpError() throws Exception { public void testCleanup_handlesDropDatabaseError() throws Exception { TestCloudSqlShardOrchestrator orchestrator = new TestCloudSqlShardOrchestrator( - SQLDialect.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); + DatabaseType.MYSQL, MYSQL_8_0, PROJECT_ID, REGION, gcsResourceManager, sqlAdmin); // Mock successful initialization DatabaseInstance instance = @@ -411,13 +408,14 @@ public void testCleanup_handlesDropDatabaseError() throws Exception { public void testVersionCompatability() { assertThrows( IllegalArgumentException.class, - () -> CloudSqlShardOrchestrator.checkVersionCompatibility(SQLDialect.MYSQL, POSTGRES_14)); + () -> CloudSqlShardOrchestrator.checkVersionCompatibility(DatabaseType.MYSQL, POSTGRES_14)); assertThrows( IllegalArgumentException.class, () -> - 
CloudSqlShardOrchestrator.checkVersionCompatibility(SQLDialect.POSTGRESQL, MYSQL_8_0)); + CloudSqlShardOrchestrator.checkVersionCompatibility( + DatabaseType.POSTGRESQL, MYSQL_8_0)); - CloudSqlShardOrchestrator.checkVersionCompatibility(SQLDialect.MYSQL, MYSQL_8_0); - CloudSqlShardOrchestrator.checkVersionCompatibility(SQLDialect.POSTGRESQL, POSTGRES_14); + CloudSqlShardOrchestrator.checkVersionCompatibility(DatabaseType.MYSQL, MYSQL_8_0); + CloudSqlShardOrchestrator.checkVersionCompatibility(DatabaseType.POSTGRESQL, POSTGRES_14); } } diff --git a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/MySQLMultiSharded1024ShardsLT.java b/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/MySQLMultiSharded1024ShardsLT.java index 2119674c6a..cd5681ebc9 100644 --- a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/MySQLMultiSharded1024ShardsLT.java +++ b/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/MySQLMultiSharded1024ShardsLT.java @@ -15,8 +15,8 @@ */ package com.google.cloud.teleport.v2.templates.loadtesting; -import static com.google.cloud.teleport.v2.templates.loadtesting.CloudSqlShardOrchestrator.MYSQL_8_0; import static com.google.common.truth.Truth.assertThat; +import static org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator.MYSQL_8_0; import static org.apache.beam.it.truthmatchers.PipelineAsserts.assertThatResult; import com.google.cloud.spanner.Struct; @@ -45,6 +45,8 @@ import org.apache.beam.it.common.utils.ResourceManagerUtils; import org.apache.beam.it.gcp.artifacts.GcsArtifact; import org.apache.beam.it.gcp.cloudsql.CloudSqlResourceManager; +import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator; +import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator.DatabaseType; import org.apache.beam.it.gcp.spanner.SpannerResourceManager; import org.json.JSONArray; import org.json.JSONObject; @@ -101,7 +103,7 @@ public void setUp() throws IOException { orchestrator = new CloudSqlShardOrchestrator( - SQLDialect.MYSQL, MYSQL_8_0, project, region, gcsResourceManager); + DatabaseType.MYSQL, MYSQL_8_0, project, region, gcsResourceManager); } @After diff --git a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/PostgreSQLMultiSharded1024ShardsLT.java b/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/PostgreSQLMultiSharded1024ShardsLT.java index 85ea94a535..1ade4d59ea 100644 --- a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/PostgreSQLMultiSharded1024ShardsLT.java +++ b/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/PostgreSQLMultiSharded1024ShardsLT.java @@ -15,8 +15,8 @@ */ package com.google.cloud.teleport.v2.templates.loadtesting; -import static com.google.cloud.teleport.v2.templates.loadtesting.CloudSqlShardOrchestrator.POSTGRES_14; import static com.google.common.truth.Truth.assertThat; +import static org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator.POSTGRES_14; import static org.apache.beam.it.truthmatchers.PipelineAsserts.assertThatResult; import com.google.cloud.spanner.Struct; @@ -45,6 +45,8 @@ import org.apache.beam.it.common.utils.ResourceManagerUtils; import org.apache.beam.it.gcp.artifacts.GcsArtifact; import org.apache.beam.it.gcp.cloudsql.CloudSqlResourceManager; +import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator; +import 
org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator.DatabaseType; import org.apache.beam.it.gcp.spanner.SpannerResourceManager; import org.json.JSONArray; import org.json.JSONObject; @@ -102,7 +104,7 @@ public void setUp() throws IOException { orchestrator = new CloudSqlShardOrchestrator( - SQLDialect.POSTGRESQL, POSTGRES_14, project, region, gcsResourceManager); + DatabaseType.POSTGRESQL, POSTGRES_14, project, region, gcsResourceManager); } @After diff --git a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java new file mode 100644 index 0000000000..efc8432517 --- /dev/null +++ b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java @@ -0,0 +1,273 @@ +/* + * Copyright (C) 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.teleport.v2.templates; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import com.google.cloud.spanner.Mutation; +import com.google.cloud.spanner.Struct; +import com.google.cloud.teleport.metadata.TemplateLoadTest; +import com.google.cloud.teleport.v2.spanner.migrations.shard.Shard; +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.beam.it.gcp.cloudsql.CloudSqlResourceManager; +import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator; +import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator.DatabaseType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Step 1 Validation / Setup-Only test for {@link SpannerToSourceDb} template. + * + *
>Objective: Verify initial setup, DDL, schemas, GCS artifacts, and CloudSQL connectivity before + * launching the massive backlog load test. + * + * <p>
This setup utilizes the programmatic {@link CloudSqlShardOrchestrator} to dynamically + * provision and manage physical instances over Private IPs inside the target VPC, completely + * bypassing proxy requirements. + */ +@Category(TemplateLoadTest.class) +@TemplateLoadTest(SpannerToSourceDb.class) +@RunWith(JUnit4.class) +public class SpannerToSourceDbBacklogStepLT extends SpannerToSourceDbLTBase { + + private static final Logger LOG = LoggerFactory.getLogger(SpannerToSourceDbBacklogStepLT.class); + + private final String spannerDdlResource = "SpannerToSourceDbBacklogLT/spanner-schema.sql"; + private final String sessionFileResource = "SpannerToSourceDbBacklogLT/session.json"; + + private CloudSqlShardOrchestrator orchestrator; + + @Before + public void setup() throws IOException { + LOG.info( + "Initializing resource managers for Step 1 Setup & Connectivity validation via Orchestrator..."); + + String password = System.getProperty("cloudProxyPassword", "Welcome@1"); + System.setProperty("cloudProxyPassword", password); + + // Setup Spanner database and metadata database, GCS artifact resource manager, and session + // files + setupResourceManagers(spannerDdlResource, sessionFileResource); + + // Initialize the Cloud SQL Shard Orchestrator for dynamic GCP-level provisioning over Private + // IP + orchestrator = + new CloudSqlShardOrchestrator( + DatabaseType.MYSQL, + CloudSqlShardOrchestrator.MYSQL_8_0, + project, + region, + gcsResourceManager); + + Map> shardMap = new HashMap<>(); + shardMap.put("nokill-high-resources-backlog-shard1", List.of("shard0", "shard1")); + shardMap.put("nokill-high-resources-backlog-shard2", List.of("shard2", "shard3")); + + // Initialize the physical instances (reusing existing ones) and logical schemas + orchestrator.initialize(shardMap, "orchestrator_shards_bulk.json"); + + // Create logical table schemas inside each database shard + LOG.info("Creating logical schemas on MySQL shards..."); + CloudSqlResourceManager manager1 = + (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard1"); + CloudSqlResourceManager manager2 = + (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard2"); + createLogicalTableSchema(manager1, "shard0"); + createLogicalTableSchema(manager1, "shard1"); + createLogicalTableSchema(manager2, "shard2"); + createLogicalTableSchema(manager2, "shard3"); + + // Upload sharding configuration in the flat format expected by SpannerToSourceDb + LOG.info("Generating and uploading flat sharding configuration to GCS..."); + createAndUploadShardConfigToGcs(); + } + + @After + public void tearDown() { + LOG.info("Cleaning up resources..."); + cleanupResourceManagers(); + if (orchestrator != null) { + orchestrator.cleanup(); + } + } + + @Test + public void test1_setupAndConnectivitySanity() throws IOException { + LOG.info("Running Step 1 Setup and Connectivity Sanity check..."); + + // 1. 
Verify Spanner Connectivity and Table DDL + LOG.info("Verifying Spanner database connectivity and schema..."); + assertNotNull("Spanner resource manager should be initialized", spannerResourceManager); + + String testId = "test-id-12345"; + String testPayload = "test-payload-step1"; + String testShardId = "shard_0"; + + // Write a row directly to Spanner + LOG.info("Writing a test row directly to Spanner..."); + List mutations = new ArrayList<>(); + mutations.add( + Mutation.newInsertOrUpdateBuilder("MigrationLoadTest") + .set("Id") + .to(testId) + .set("Payload") + .to(testPayload) + .set("migration_shard_id") + .to(testShardId) + .build()); + spannerResourceManager.write(mutations); + + // Read the row back from Spanner to verify + LOG.info("Reading the test row back from Spanner..."); + List results = + spannerResourceManager.runQuery( + String.format( + "SELECT Payload FROM MigrationLoadTest WHERE migration_shard_id = '%s' AND Id = '%s'", + testShardId, testId)); + assertNotNull("Results from Spanner should not be null", results); + assertEquals("Should return exactly 1 row", 1, results.size()); + assertEquals( + "Payload matches what was written to Spanner", + testPayload, + results.get(0).getString("Payload")); + + // Delete test row from Spanner + LOG.info("Deleting test row from Spanner..."); + spannerResourceManager.write( + List.of( + Mutation.delete( + "MigrationLoadTest", com.google.cloud.spanner.Key.of(testShardId, testId)))); + + // 2. Verify GCS Artifacts (Session and Sharding configurations) + LOG.info("Verifying GCS configuration artifacts..."); + assertNotNull("GCS resource manager should be initialized", gcsResourceManager); + + String sessionGcsPath = getGcsPath(SESSION_FILE_NAME, gcsResourceManager); + LOG.info("Session file GCS Path: {}", sessionGcsPath); + assertTrue("Session file should exist on GCS", sessionGcsPath.startsWith("gs://")); + + String shardGcsPath = getGcsPath(SOURCE_SHARDS_FILE_NAME, gcsResourceManager); + LOG.info("Shard file GCS Path: {}", shardGcsPath); + assertTrue("Shard file should exist on GCS", shardGcsPath.startsWith("gs://")); + + // 3. Verify MySQL Connectivity, DDL, and Shards + LOG.info("Verifying CloudSQL MySQL Shard connectivity and DDL..."); + CloudSqlResourceManager manager1 = + (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard1"); + CloudSqlResourceManager manager2 = + (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard2"); + assertNotNull("Shard 1 resource manager should be initialized", manager1); + assertNotNull("Shard 2 resource manager should be initialized", manager2); + + // Write and read from logical database shard0 (on Shard 1 physical instance) + verifyMySqlLogicalShard(manager1, "shard0"); + // Write and read from logical database shard1 (on Shard 1 physical instance) + verifyMySqlLogicalShard(manager1, "shard1"); + // Write and read from logical database shard2 (on Shard 2 physical instance) + verifyMySqlLogicalShard(manager2, "shard2"); + // Write and read from logical database shard3 (on Shard 2 physical instance) + verifyMySqlLogicalShard(manager2, "shard3"); + + LOG.info("Step 1 Setup and Connectivity Sanity check passed successfully! 
All systems are GO."); + } + + private void verifyMySqlLogicalShard(CloudSqlResourceManager manager, String dbName) { + LOG.info("Verifying logical database: {}...", dbName); + + String testId = "test-id-" + dbName; + String testPayload = "payload-" + dbName; + + // Insert test row + String insertSql = + String.format( + "INSERT INTO %s.MigrationLoadTest (Id, Payload) VALUES ('%s', '%s')", + dbName, testId, testPayload); + manager.runSQLUpdate(insertSql); + + // Query test row back + String selectSql = + String.format("SELECT Payload FROM %s.MigrationLoadTest WHERE Id = '%s'", dbName, testId); + List> result = manager.runSQLQuery(selectSql); + + assertNotNull("Result from MySQL logical shard " + dbName + " should not be null", result); + assertEquals("Should return exactly 1 row", 1, result.size()); + assertEquals( + "Payload matches what was written to " + dbName, testPayload, result.get(0).get("Payload")); + + // Cleanup test row + String deleteSql = + String.format("DELETE FROM %s.MigrationLoadTest WHERE Id = '%s'", dbName, testId); + manager.runSQLUpdate(deleteSql); + LOG.info("Logical database {} verified successfully.", dbName); + } + + private void createLogicalTableSchema(CloudSqlResourceManager manager, String dbName) { + manager.runSQLUpdate( + "CREATE TABLE IF NOT EXISTS " + + dbName + + ".MigrationLoadTest (" + + "Id VARCHAR(36) NOT NULL," + + "Payload LONGTEXT NOT NULL," + + "PRIMARY KEY (Id)" + + ") ENGINE=InnoDB"); + } + + private void createAndUploadShardConfigToGcs() throws IOException { + CloudSqlResourceManager manager1 = + (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard1"); + CloudSqlResourceManager manager2 = + (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard2"); + + JsonArray ja = new JsonArray(); + ja.add(createShardConfig("shard_0", "shard0", manager1)); + ja.add(createShardConfig("shard_1", "shard1", manager1)); + ja.add(createShardConfig("shard_2", "shard2", manager2)); + ja.add(createShardConfig("shard_3", "shard3", manager2)); + + String shardFileContents = ja.toString(); + LOG.info("Shard file contents: {}", shardFileContents); + gcsResourceManager.createArtifact(SOURCE_SHARDS_FILE_NAME, shardFileContents); + } + + private JsonObject createShardConfig( + String logicalShardId, String dbName, CloudSqlResourceManager manager) { + Shard shard = new Shard(); + shard.setLogicalShardId(logicalShardId); + shard.setUser(manager.getUsername()); + shard.setHost(manager.getHost()); + shard.setPassword(manager.getPassword()); + shard.setPort(String.valueOf(manager.getPort())); + shard.setDbName(dbName); + JsonObject jsObj = (JsonObject) new Gson().toJsonTree(shard).getAsJsonObject(); + jsObj.remove("secretManagerUri"); + return jsObj; + } +} diff --git a/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbBacklogLT/session.json b/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbBacklogLT/session.json new file mode 100644 index 0000000000..f8035b77a0 --- /dev/null +++ b/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbBacklogLT/session.json @@ -0,0 +1,313 @@ +{ + "SpSchema": { + "t1": { + "Name": "MigrationLoadTest", + "ColIds": [ + "c2", + "c3", + "c4" + ], + "ShardIdColumn": "c4", + "ColDefs": { + "c2": { + "Name": "Id", + "T": { + "Name": "STRING", + "Len": 36, + "IsArray": false + }, + "NotNull": true, + "Comment": "From: Id varchar(36)", + "Id": "c2", + "AutoGen": { + "Name": "", + "GenerationType": "", + "IdentityOptions": { + "SkipRangeMin": "", + 
"SkipRangeMax": "", + "StartCounterWith": "" + } + }, + "DefaultValue": { + "IsPresent": false, + "Value": { + "ExpressionId": "", + "Statement": "" + } + }, + "GeneratedColumn": { + "IsPresent": false, + "Value": { + "ExpressionId": "", + "Statement": "" + }, + "Type": "" + }, + "Opts": null + }, + "c3": { + "Name": "Payload", + "T": { + "Name": "STRING", + "Len": 9223372036854775807, + "IsArray": false + }, + "NotNull": true, + "Comment": "From: Payload longtext(4294967295)", + "Id": "c3", + "AutoGen": { + "Name": "", + "GenerationType": "", + "IdentityOptions": { + "SkipRangeMin": "", + "SkipRangeMax": "", + "StartCounterWith": "" + } + }, + "DefaultValue": { + "IsPresent": false, + "Value": { + "ExpressionId": "", + "Statement": "" + } + }, + "GeneratedColumn": { + "IsPresent": false, + "Value": { + "ExpressionId": "", + "Statement": "" + }, + "Type": "" + }, + "Opts": null + }, + "c4": { + "Name": "migration_shard_id", + "T": { + "Name": "STRING", + "Len": 50, + "IsArray": false + }, + "NotNull": false, + "Comment": "", + "Id": "c4", + "AutoGen": { + "Name": "", + "GenerationType": "", + "IdentityOptions": { + "SkipRangeMin": "", + "SkipRangeMax": "", + "StartCounterWith": "" + } + }, + "DefaultValue": { + "IsPresent": false, + "Value": { + "ExpressionId": "", + "Statement": "" + } + }, + "GeneratedColumn": { + "IsPresent": false, + "Value": { + "ExpressionId": "", + "Statement": "" + }, + "Type": "" + }, + "Opts": null + } + }, + "PrimaryKeys": [ + { + "ColId": "c2", + "Desc": false, + "Order": 2 + }, + { + "ColId": "c4", + "Desc": false, + "Order": 1 + } + ], + "ForeignKeys": null, + "Indexes": null, + "ParentTable": { + "Id": "", + "OnDelete": "", + "InterleaveType": "" + }, + "CheckConstraints": null, + "Comment": "Spanner schema for source table MigrationLoadTest", + "Id": "t1" + } + }, + "SyntheticPKeys": {}, + "SrcSchema": { + "t1": { + "Name": "MigrationLoadTest", + "Schema": "shard0", + "ColIds": [ + "c2", + "c3" + ], + "ColDefs": { + "c2": { + "Name": "Id", + "Type": { + "Name": "varchar", + "Mods": [ + 36 + ], + "ArrayBounds": null + }, + "NotNull": true, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c2", + "AutoGen": { + "Name": "", + "GenerationType": "", + "IdentityOptions": { + "SkipRangeMin": "", + "SkipRangeMax": "", + "StartCounterWith": "" + } + }, + "DefaultValue": { + "IsPresent": false, + "Value": { + "ExpressionId": "", + "Statement": "" + } + }, + "GeneratedColumn": { + "IsPresent": false, + "Value": { + "ExpressionId": "", + "Statement": "" + }, + "Type": "" + } + }, + "c3": { + "Name": "Payload", + "Type": { + "Name": "longtext", + "Mods": [ + 4294967295 + ], + "ArrayBounds": null + }, + "NotNull": true, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c3", + "AutoGen": { + "Name": "", + "GenerationType": "", + "IdentityOptions": { + "SkipRangeMin": "", + "SkipRangeMax": "", + "StartCounterWith": "" + } + }, + "DefaultValue": { + "IsPresent": false, + "Value": { + "ExpressionId": "", + "Statement": "" + } + }, + "GeneratedColumn": { + "IsPresent": false, + "Value": { + "ExpressionId": "", + "Statement": "" + }, + "Type": "" + } + } + }, + "PrimaryKeys": [ + { + "ColId": "c2", + "Desc": false, + "Order": 1 + } + ], + "ForeignKeys": null, + "CheckConstraints": null, + "Indexes": null, + "Id": "t1" + } + }, + "SchemaIssues": { + "t1": { + 
"ColumnLevelIssues": { + "c4": [ + 29 + ] + }, + "TableLevelIssues": null + } + }, + "InvalidCheckExp": null, + "ToSpanner": { + "MigrationLoadTest": { + "Name": "MigrationLoadTest", + "Cols": { + "Id": "Id", + "Payload": "Payload" + } + } + }, + "Location": {}, + "TimezoneOffset": "+00:00", + "SpDialect": "google_standard_sql", + "UniquePKey": {}, + "Rules": [ + { + "Id": "r5", + "Name": "r5", + "Type": "add_shard_id_primary_key", + "ObjectType": "", + "AssociatedObjects": "All Tables", + "Enabled": true, + "Data": { + "AddedAtTheStart": true + }, + "AddedOn": null + } + ], + "IsSharded": true, + "SpRegion": "", + "ResourceValidation": false, + "UI": true, + "SpSequences": {}, + "SrcSequences": {}, + "SpProjectId": "span-cloud-ck-testing-external", + "SpInstanceId": "ea-functional-tests", + "Source": "mysql", + "DatabaseOptions": { + "DbName": "", + "DefaultTimezone": "" + }, + "DefaultIdentityOptions": { + "SkipRangeMin": "", + "SkipRangeMax": "", + "StartCounterWith": "" + } +} \ No newline at end of file diff --git a/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbBacklogLT/spanner-schema.sql b/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbBacklogLT/spanner-schema.sql new file mode 100644 index 0000000000..5e36104ff9 --- /dev/null +++ b/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbBacklogLT/spanner-schema.sql @@ -0,0 +1,12 @@ +CREATE TABLE MigrationLoadTest ( + Id STRING(36) NOT NULL, + Payload STRING(MAX) NOT NULL, + migration_shard_id STRING(50), +) PRIMARY KEY (migration_shard_id, Id); + +CREATE CHANGE STREAM MigrationStream +FOR MigrationLoadTest +OPTIONS ( + retention_period = '7d', + value_capture_type = 'OLD_AND_NEW_VALUES' +); From f1289c4330bf781d6a9fb573f9c50e6471857907 Mon Sep 17 00:00:00 2001 From: aasthabharill Date: Thu, 7 May 2026 18:11:36 +0530 Subject: [PATCH 2/6] import test --- .../SpannerToSourceDbBacklogStepLT.java | 159 +++++++++++++++++- 1 file changed, 156 insertions(+), 3 deletions(-) diff --git a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java index efc8432517..fded96c856 100644 --- a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java +++ b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java @@ -15,6 +15,9 @@ */ package com.google.cloud.teleport.v2.templates; +import static org.apache.beam.it.common.TestProperties.getProperty; +import static org.apache.beam.it.truthmatchers.PipelineAsserts.assertThatPipeline; +import static org.apache.beam.it.truthmatchers.PipelineAsserts.assertThatResult; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; @@ -27,13 +30,18 @@ import com.google.gson.JsonArray; import com.google.gson.JsonObject; import java.io.IOException; +import java.time.Duration; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import org.apache.beam.it.common.PipelineLauncher; +import org.apache.beam.it.common.PipelineOperator; +import org.apache.beam.it.common.TestProperties; import org.apache.beam.it.gcp.cloudsql.CloudSqlResourceManager; import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator; import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator.DatabaseType; +import 
org.apache.beam.it.gcp.dataflow.ClassicTemplateClient; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -62,17 +70,16 @@ public class SpannerToSourceDbBacklogStepLT extends SpannerToSourceDbLTBase { private final String spannerDdlResource = "SpannerToSourceDbBacklogLT/spanner-schema.sql"; private final String sessionFileResource = "SpannerToSourceDbBacklogLT/session.json"; + private final String table = "MigrationLoadTest"; private CloudSqlShardOrchestrator orchestrator; + private Integer originalSpannerNodeCount = null; @Before public void setup() throws IOException { LOG.info( "Initializing resource managers for Step 1 Setup & Connectivity validation via Orchestrator..."); - String password = System.getProperty("cloudProxyPassword", "Welcome@1"); - System.setProperty("cloudProxyPassword", password); - // Setup Spanner database and metadata database, GCS artifact resource manager, and session // files setupResourceManagers(spannerDdlResource, sessionFileResource); @@ -113,6 +120,16 @@ public void setup() throws IOException { @After public void tearDown() { LOG.info("Cleaning up resources..."); + + // Reset Spanner instance to its original node count if it was modified + if (originalSpannerNodeCount != null && spannerResourceManager != null) { + try { + updateSpannerNodeCount(spannerResourceManager.getInstanceId(), originalSpannerNodeCount); + } catch (Exception e) { + LOG.warn("Failed to reset Spanner node count during teardown: ", e); + } + } + cleanupResourceManagers(); if (orchestrator != null) { orchestrator.cleanup(); @@ -270,4 +287,140 @@ private JsonObject createShardConfig( jsObj.remove("secretManagerUri"); return jsObj; } + + @Test + public void test2_avroImportSanity() throws IOException, InterruptedException { + LOG.info("Running Step 2 Avro Import Sanity check..."); + + // Get parameters + String avroInputDir = + getProperty( + "avroInputDir", + "gs://nokill-spanner-to-sourcedb-load/small-data/avro/", + TestProperties.Type.PROPERTY); + long expectedSpannerCount = + Long.parseLong(getProperty("expectedSpannerCount", "100", TestProperties.Type.PROPERTY)); + int importTimeoutMinutes = + Integer.parseInt(getProperty("importTimeoutMinutes", "15", TestProperties.Type.PROPERTY)); + + // Ensure avroInputDir ends with a trailing slash for the classic import template + if (!avroInputDir.endsWith("/")) { + avroInputDir = avroInputDir + "/"; + } + + LOG.info("Avro Input Directory: {}", avroInputDir); + LOG.info("Expected Spanner count: {}", expectedSpannerCount); + + int scaleNodes = + Integer.parseInt(getProperty("spannerScaleNodes", "25", TestProperties.Type.PROPERTY)); + + LOG.info("Scaling up Spanner instance to {} nodes before starting import job...", scaleNodes); + updateSpannerNodeCount(spannerResourceManager.getInstanceId(), scaleNodes); + + // Assert/Verify that the node count was updated successfully + int currentNodeCount = getSpannerNodeCount(spannerResourceManager.getInstanceId()); + LOG.info("Verified current Spanner instance node count is: {}", currentNodeCount); + assertEquals( + "Spanner instance node count mismatch after scale-up", scaleNodes, currentNodeCount); + + // Launch Avro-to-Spanner import job + LOG.info("Launching classic GCS Avro to Cloud Spanner Import job with a small dataset..."); + PipelineLauncher.LaunchInfo importJobInfo = launchClassicImportJob(avroInputDir); + assertThatPipeline(importJobInfo).isRunning(); + + // Wait for the Import job to complete with a fail-fast timeout (e.g., 15 minutes) + LOG.info( + "Waiting for Spanner 
import job to finish (timeout: {} mins)...", importTimeoutMinutes); + PipelineOperator.Result importResult = + pipelineOperator.waitUntilDone( + createConfig(importJobInfo, Duration.ofMinutes(importTimeoutMinutes))); + + assertThatResult(importResult).isLaunchFinished(); + LOG.info("Import job completed successfully."); + + // Assert that the rows have been imported correctly into Spanner + long spannerCount = spannerResourceManager.getRowCount(table); + LOG.info("Spanner database row count after import: {}", spannerCount); + assertEquals("Spanner database row count mismatch", expectedSpannerCount, spannerCount); + + LOG.info("Step 2 Avro Import Sanity check passed successfully! Import is verified."); + } + + private PipelineLauncher.LaunchInfo launchClassicImportJob(String inputDir) throws IOException { + ClassicTemplateClient classicClient = ClassicTemplateClient.builder(CREDENTIALS).build(); + + Map params = new HashMap<>(); + params.put("instanceId", spannerResourceManager.getInstanceId()); + params.put("databaseId", spannerResourceManager.getDatabaseId()); + params.put("inputDir", inputDir); + + PipelineLauncher.LaunchConfig options = + PipelineLauncher.LaunchConfig.builder( + "spanner-avro-import-sanity", + "gs://dataflow-templates/latest/GCS_Avro_to_Cloud_Spanner") + .setParameters(params) + .addEnvironment("numWorkers", 80) + .addEnvironment("maxWorkers", 120) + .addEnvironment("machineType", "n2-standard-8") + .build(); + + return classicClient.launch(project, region, options); + } + + /** + * Programmatically updates the node count of the Spanner instance. Useful for scaling up before + * heavy loads and downscaling afterwards. + */ + public void updateSpannerNodeCount(String instanceId, int nodeCount) { + com.google.cloud.spanner.SpannerOptions options = + com.google.cloud.spanner.SpannerOptions.newBuilder().setProjectId(project).build(); + try (com.google.cloud.spanner.Spanner spanner = options.getService()) { + com.google.cloud.spanner.InstanceAdminClient instanceAdminClient = + spanner.getInstanceAdminClient(); + + // Capture the original node count before the first modification + if (originalSpannerNodeCount == null) { + com.google.cloud.spanner.Instance instance = instanceAdminClient.getInstance(instanceId); + originalSpannerNodeCount = instance.getNodeCount(); + LOG.info( + "Captured original Spanner instance {} node count: {}", + instanceId, + originalSpannerNodeCount); + } + + LOG.info( + "Updating Spanner instance {} node count from {} to {}...", + instanceId, + originalSpannerNodeCount, + nodeCount); + com.google.cloud.spanner.InstanceInfo instanceInfo = + com.google.cloud.spanner.InstanceInfo.newBuilder( + com.google.cloud.spanner.InstanceId.of(project, instanceId)) + .setNodeCount(nodeCount) + .build(); + instanceAdminClient + .updateInstance( + instanceInfo, com.google.cloud.spanner.InstanceInfo.InstanceField.NODE_COUNT) + .get(); + LOG.info("Successfully updated Spanner instance {} node count to {}.", instanceId, nodeCount); + } catch (Exception e) { + LOG.error("Failed to update Spanner instance node count.", e); + throw new RuntimeException("Failed to update Spanner node count", e); + } + } + + /** Programmatically retrieves the current node count of the Spanner instance. 
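 *
 * <p>Assumes the instance is provisioned with nodes; for instances sized in processing units,
 * {@code Instance.getNodeCount()} may not reflect actual capacity. A usage sketch mirroring the
 * scale-up verification in this test (the target node count is illustrative):
 *
 * <pre>{@code
 * updateSpannerNodeCount(instanceId, 25);
 * assertEquals(25, getSpannerNodeCount(instanceId));
 * }</pre>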
*/ + public int getSpannerNodeCount(String instanceId) { + com.google.cloud.spanner.SpannerOptions options = + com.google.cloud.spanner.SpannerOptions.newBuilder().setProjectId(project).build(); + try (com.google.cloud.spanner.Spanner spanner = options.getService()) { + com.google.cloud.spanner.InstanceAdminClient instanceAdminClient = + spanner.getInstanceAdminClient(); + com.google.cloud.spanner.Instance instance = instanceAdminClient.getInstance(instanceId); + return instance.getNodeCount(); + } catch (Exception e) { + LOG.error("Failed to retrieve Spanner instance node count.", e); + throw new RuntimeException("Failed to get Spanner node count", e); + } + } } From a45aef10ae6ef7e7fc3eef0c32fdb3d4b661c9f2 Mon Sep 17 00:00:00 2001 From: aasthabharill Date: Thu, 7 May 2026 19:07:10 +0530 Subject: [PATCH 3/6] reverse test --- .../SpannerToSourceDbBacklogStepLT.java | 320 ++++++++++++------ 1 file changed, 208 insertions(+), 112 deletions(-) diff --git a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java index fded96c856..2b669ed523 100644 --- a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java +++ b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java @@ -52,14 +52,12 @@ import org.slf4j.LoggerFactory; /** - * Step 1 Validation / Setup-Only test for {@link SpannerToSourceDb} template. + * Progressive, fail-fast Consolidated End-to-End Load/Sanity test for {@link SpannerToSourceDb} + * template. * - *
>Objective: Verify initial setup, DDL, schemas, GCS artifacts, and CloudSQL connectivity before - * launching the massive backlog load test. - * - * <p>This setup utilizes the programmatic {@link CloudSqlShardOrchestrator} to dynamically - * provision and manage physical instances over Private IPs inside the target VPC, completely - * bypassing proxy requirements. + * <p>
Objective: Validate Spanner, GCS config, CloudSQL connectivity (Step 1), classic Avro Import + * template upscaling and execution (Step 2), and SpannerToSourceDb reverse replication E2E sanity + * backlog migration (Step 3) back-to-back in under 15 minutes using a tiny dataset. */ @Category(TemplateLoadTest.class) @TemplateLoadTest(SpannerToSourceDb.class) @@ -68,17 +66,21 @@ public class SpannerToSourceDbBacklogStepLT extends SpannerToSourceDbLTBase { private static final Logger LOG = LoggerFactory.getLogger(SpannerToSourceDbBacklogStepLT.class); + private static final String TEMPLATE_SPEC_PATH = + com.google.common.base.MoreObjects.firstNonNull( + TestProperties.specPath(), "gs://dataflow-templates/latest/flex/Spanner_to_SourceDb"); + private final String spannerDdlResource = "SpannerToSourceDbBacklogLT/spanner-schema.sql"; private final String sessionFileResource = "SpannerToSourceDbBacklogLT/session.json"; private final String table = "MigrationLoadTest"; private CloudSqlShardOrchestrator orchestrator; private Integer originalSpannerNodeCount = null; + private Integer originalSpannerMetadataNodeCount = null; @Before public void setup() throws IOException { - LOG.info( - "Initializing resource managers for Step 1 Setup & Connectivity validation via Orchestrator..."); + LOG.info("Initializing resource managers for Consolidated E2E Sanity Test via Orchestrator..."); // Setup Spanner database and metadata database, GCS artifact resource manager, and session // files @@ -115,6 +117,11 @@ public void setup() throws IOException { // Upload sharding configuration in the flat format expected by SpannerToSourceDb LOG.info("Generating and uploading flat sharding configuration to GCS..."); createAndUploadShardConfigToGcs(); + + // Store original node counts for cleanup + originalSpannerNodeCount = getSpannerNodeCount(spannerResourceManager.getInstanceId()); + originalSpannerMetadataNodeCount = + getSpannerNodeCount(spannerMetadataResourceManager.getInstanceId()); } @After @@ -129,6 +136,15 @@ public void tearDown() { LOG.warn("Failed to reset Spanner node count during teardown: ", e); } } + // Reset Spanner Metadata instance to its original node count if it was modified + if (originalSpannerMetadataNodeCount != null && spannerMetadataResourceManager != null) { + try { + updateSpannerNodeCount( + spannerMetadataResourceManager.getInstanceId(), originalSpannerMetadataNodeCount); + } catch (Exception e) { + LOG.warn("Failed to reset Spanner Metadata node count during teardown: ", e); + } + } cleanupResourceManagers(); if (orchestrator != null) { @@ -137,19 +153,20 @@ public void tearDown() { } @Test - public void test1_setupAndConnectivitySanity() throws IOException { - LOG.info("Running Step 1 Setup and Connectivity Sanity check..."); + public void backlogReplicationSanityE2E() throws IOException, InterruptedException { + LOG.info("Running Consolidated Backlog Replication E2E Sanity Test..."); - // 1. 
Verify Spanner Connectivity and Table DDL - LOG.info("Verifying Spanner database connectivity and schema..."); + // ------------------------------------------------------------- + // PHASE 1: Connectivity & Setup Sanity + // ------------------------------------------------------------- + LOG.info("PHASE 1: Verifying Spanner & CloudSQL setup connectivity..."); assertNotNull("Spanner resource manager should be initialized", spannerResourceManager); String testId = "test-id-12345"; - String testPayload = "test-payload-step1"; + String testPayload = "test-payload-sanity"; String testShardId = "shard_0"; - // Write a row directly to Spanner - LOG.info("Writing a test row directly to Spanner..."); + // Write/Read Spanner Ping Row List mutations = new ArrayList<>(); mutations.add( Mutation.newInsertOrUpdateBuilder("MigrationLoadTest") @@ -162,58 +179,150 @@ public void test1_setupAndConnectivitySanity() throws IOException { .build()); spannerResourceManager.write(mutations); - // Read the row back from Spanner to verify - LOG.info("Reading the test row back from Spanner..."); - List results = + List spannerResults = spannerResourceManager.runQuery( String.format( "SELECT Payload FROM MigrationLoadTest WHERE migration_shard_id = '%s' AND Id = '%s'", testShardId, testId)); - assertNotNull("Results from Spanner should not be null", results); - assertEquals("Should return exactly 1 row", 1, results.size()); + assertNotNull("Results from Spanner should not be null", spannerResults); + assertEquals("Should return exactly 1 row", 1, spannerResults.size()); assertEquals( "Payload matches what was written to Spanner", testPayload, - results.get(0).getString("Payload")); - - // Delete test row from Spanner - LOG.info("Deleting test row from Spanner..."); + spannerResults.get(0).getString("Payload")); spannerResourceManager.write( List.of( Mutation.delete( "MigrationLoadTest", com.google.cloud.spanner.Key.of(testShardId, testId)))); - // 2. Verify GCS Artifacts (Session and Sharding configurations) - LOG.info("Verifying GCS configuration artifacts..."); - assertNotNull("GCS resource manager should be initialized", gcsResourceManager); - + // Verify GCS configs exist String sessionGcsPath = getGcsPath(SESSION_FILE_NAME, gcsResourceManager); - LOG.info("Session file GCS Path: {}", sessionGcsPath); assertTrue("Session file should exist on GCS", sessionGcsPath.startsWith("gs://")); - String shardGcsPath = getGcsPath(SOURCE_SHARDS_FILE_NAME, gcsResourceManager); - LOG.info("Shard file GCS Path: {}", shardGcsPath); assertTrue("Shard file should exist on GCS", shardGcsPath.startsWith("gs://")); - // 3. 
Verify MySQL Connectivity, DDL, and Shards - LOG.info("Verifying CloudSQL MySQL Shard connectivity and DDL..."); + // Verify CloudSQL connectivity CloudSqlResourceManager manager1 = (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard1"); CloudSqlResourceManager manager2 = (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard2"); assertNotNull("Shard 1 resource manager should be initialized", manager1); assertNotNull("Shard 2 resource manager should be initialized", manager2); - - // Write and read from logical database shard0 (on Shard 1 physical instance) verifyMySqlLogicalShard(manager1, "shard0"); - // Write and read from logical database shard1 (on Shard 1 physical instance) verifyMySqlLogicalShard(manager1, "shard1"); - // Write and read from logical database shard2 (on Shard 2 physical instance) verifyMySqlLogicalShard(manager2, "shard2"); - // Write and read from logical database shard3 (on Shard 2 physical instance) verifyMySqlLogicalShard(manager2, "shard3"); - LOG.info("Step 1 Setup and Connectivity Sanity check passed successfully! All systems are GO."); + // ------------------------------------------------------------- + // PHASE 2: Spanner Scale-Up & Avro Import + // ------------------------------------------------------------- + LOG.info("PHASE 2: Scaling Spanner & running Avro Import..."); + + // Record UTC start timestamp before import begins (to serve as change stream start timestamp) + String startTimestamp = java.time.Instant.now().toString(); + LOG.info("Recorded UTC start timestamp for change stream: {}", startTimestamp); + + int scaleNodes = + Integer.parseInt(getProperty("spannerScaleNodes", "25", TestProperties.Type.PROPERTY)); + updateSpannerNodeCount(spannerResourceManager.getInstanceId(), scaleNodes); + + // Verify scale-up + int currentNodeCount = getSpannerNodeCount(spannerResourceManager.getInstanceId()); + assertEquals( + "Spanner instance node count mismatch after scale-up", scaleNodes, currentNodeCount); + + // Run Avro Import with small dataset (100 rows) + String avroInputDir = + getProperty( + "avroInputDir", + "gs://nokill-spanner-to-sourcedb-load/small-data/avro/", + TestProperties.Type.PROPERTY); + long expectedSpannerCount = + Long.parseLong(getProperty("expectedSpannerCount", "100", TestProperties.Type.PROPERTY)); + int importTimeoutMinutes = + Integer.parseInt(getProperty("importTimeoutMinutes", "15", TestProperties.Type.PROPERTY)); + + // Ensure avroInputDir ends with a trailing slash for the classic import template + if (!avroInputDir.endsWith("/")) { + avroInputDir = avroInputDir + "/"; + } + + LOG.info("Avro Input Directory: {}", avroInputDir); + LOG.info("Expected Spanner count: {}", expectedSpannerCount); + + PipelineLauncher.LaunchInfo importJobInfo = launchClassicImportJob(avroInputDir); + assertThatPipeline(importJobInfo).isRunning(); + + PipelineOperator.Result importResult = + pipelineOperator.waitUntilDone( + createConfig(importJobInfo, Duration.ofMinutes(importTimeoutMinutes))); + assertThatResult(importResult).isLaunchFinished(); + + long spannerCount = spannerResourceManager.getRowCount(table); + assertEquals("Spanner database row count mismatch", expectedSpannerCount, spannerCount); + LOG.info("Import Phase successful! 
Imported {} rows.", spannerCount); + + // ------------------------------------------------------------- + // PHASE 3: Downscale & Reverse Replication E2E Verification + // ------------------------------------------------------------- + // Downscale main Spanner instance to 5 nodes and upscale metadata Spanner instance to 20 nodes + LOG.info( + "Downscaling main Spanner instance to 5 nodes and upscaling metadata instance to 20 nodes before starting replication..."); + updateSpannerNodeCount(spannerResourceManager.getInstanceId(), 5); + updateSpannerNodeCount(spannerMetadataResourceManager.getInstanceId(), 20); + + int reverseTimeoutMinutes = + Integer.parseInt(getProperty("reverseTimeoutMinutes", "10", TestProperties.Type.PROPERTY)); + int maxShardConnections = + Integer.parseInt(getProperty("maxShardConnections", "2000", TestProperties.Type.PROPERTY)); + + PipelineLauncher.LaunchInfo reverseJobInfo = + launchReverseReplicationJob(startTimestamp, 200, 200, "n2-highmem-8", maxShardConnections); + assertThatPipeline(reverseJobInfo).isRunning(); + + // Poll success_record_count metric until it reaches the expected count (100) + long polledCount = 0; + long startTimeMillis = System.currentTimeMillis(); + while (polledCount < expectedSpannerCount) { + if (System.currentTimeMillis() - startTimeMillis > reverseTimeoutMinutes * 60 * 1000) { + throw new RuntimeException( + "Reverse replication sanity check timed out after " + + reverseTimeoutMinutes + + " minutes."); + } + Thread.sleep(30000); // Poll every 30 seconds + Double metricVal = + pipelineLauncher.getMetric( + project, region, reverseJobInfo.jobId(), "success_record_count"); + polledCount = metricVal != null ? metricVal.longValue() : 0; + LOG.info("Polled success_record_count: {}. Target: {}", polledCount, expectedSpannerCount); + } + + // Verify database parity on MySQL shards + LOG.info( + "Replication threshold reached. Verifying logical databases row counts on CloudSQL..."); + long count0 = getLogicalDatabaseRowCount(manager1, "shard0"); + long count1 = getLogicalDatabaseRowCount(manager1, "shard1"); + long count2 = getLogicalDatabaseRowCount(manager2, "shard2"); + long count3 = getLogicalDatabaseRowCount(manager2, "shard3"); + + LOG.info( + "Logical databases replicated row counts: shard0={}, shard1={}, shard2={}, shard3={}", + count0, + count1, + count2, + count3); + assertEquals("shard0 row count mismatch", 0L, count0); + assertEquals("shard1 row count mismatch", expectedSpannerCount, count1); + assertEquals("shard2 row count mismatch", 0L, count2); + assertEquals("shard3 row count mismatch", 0L, count3); + + LOG.info("All systems and replication components verified E2E! 
Cancelling job..."); + PipelineOperator.Result cancelResult = + pipelineOperator.cancelJobAndFinish(createConfig(reverseJobInfo, Duration.ofMinutes(5))); + assertThatResult(cancelResult).isLaunchFinished(); + + LOG.info("Consolidated Backlog Replication E2E Sanity Test passed successfully!"); } private void verifyMySqlLogicalShard(CloudSqlResourceManager manager, String dbName) { @@ -288,64 +397,6 @@ private JsonObject createShardConfig( return jsObj; } - @Test - public void test2_avroImportSanity() throws IOException, InterruptedException { - LOG.info("Running Step 2 Avro Import Sanity check..."); - - // Get parameters - String avroInputDir = - getProperty( - "avroInputDir", - "gs://nokill-spanner-to-sourcedb-load/small-data/avro/", - TestProperties.Type.PROPERTY); - long expectedSpannerCount = - Long.parseLong(getProperty("expectedSpannerCount", "100", TestProperties.Type.PROPERTY)); - int importTimeoutMinutes = - Integer.parseInt(getProperty("importTimeoutMinutes", "15", TestProperties.Type.PROPERTY)); - - // Ensure avroInputDir ends with a trailing slash for the classic import template - if (!avroInputDir.endsWith("/")) { - avroInputDir = avroInputDir + "/"; - } - - LOG.info("Avro Input Directory: {}", avroInputDir); - LOG.info("Expected Spanner count: {}", expectedSpannerCount); - - int scaleNodes = - Integer.parseInt(getProperty("spannerScaleNodes", "25", TestProperties.Type.PROPERTY)); - - LOG.info("Scaling up Spanner instance to {} nodes before starting import job...", scaleNodes); - updateSpannerNodeCount(spannerResourceManager.getInstanceId(), scaleNodes); - - // Assert/Verify that the node count was updated successfully - int currentNodeCount = getSpannerNodeCount(spannerResourceManager.getInstanceId()); - LOG.info("Verified current Spanner instance node count is: {}", currentNodeCount); - assertEquals( - "Spanner instance node count mismatch after scale-up", scaleNodes, currentNodeCount); - - // Launch Avro-to-Spanner import job - LOG.info("Launching classic GCS Avro to Cloud Spanner Import job with a small dataset..."); - PipelineLauncher.LaunchInfo importJobInfo = launchClassicImportJob(avroInputDir); - assertThatPipeline(importJobInfo).isRunning(); - - // Wait for the Import job to complete with a fail-fast timeout (e.g., 15 minutes) - LOG.info( - "Waiting for Spanner import job to finish (timeout: {} mins)...", importTimeoutMinutes); - PipelineOperator.Result importResult = - pipelineOperator.waitUntilDone( - createConfig(importJobInfo, Duration.ofMinutes(importTimeoutMinutes))); - - assertThatResult(importResult).isLaunchFinished(); - LOG.info("Import job completed successfully."); - - // Assert that the rows have been imported correctly into Spanner - long spannerCount = spannerResourceManager.getRowCount(table); - LOG.info("Spanner database row count after import: {}", spannerCount); - assertEquals("Spanner database row count mismatch", expectedSpannerCount, spannerCount); - - LOG.info("Step 2 Avro Import Sanity check passed successfully! Import is verified."); - } - private PipelineLauncher.LaunchInfo launchClassicImportJob(String inputDir) throws IOException { ClassicTemplateClient classicClient = ClassicTemplateClient.builder(CREDENTIALS).build(); @@ -367,10 +418,56 @@ private PipelineLauncher.LaunchInfo launchClassicImportJob(String inputDir) thro return classicClient.launch(project, region, options); } - /** - * Programmatically updates the node count of the Spanner instance. Useful for scaling up before - * heavy loads and downscaling afterwards. 
- */ + private PipelineLauncher.LaunchInfo launchReverseReplicationJob( + String startTimestamp, + int numWorkers, + int maxWorkers, + String machineType, + int maxShardConnections) + throws IOException { + + Map params = new HashMap<>(); + params.put("changeStreamName", "MigrationStream"); + params.put("instanceId", spannerResourceManager.getInstanceId()); + params.put("databaseId", spannerResourceManager.getDatabaseId()); + params.put("spannerProjectId", project); + params.put("metadataInstance", spannerMetadataResourceManager.getInstanceId()); + params.put("metadataDatabase", spannerMetadataResourceManager.getDatabaseId()); + params.put("sourceShardsFilePath", getGcsPath(SOURCE_SHARDS_FILE_NAME, gcsResourceManager)); + params.put("deadLetterQueueDirectory", getGcsPath("dlq", gcsResourceManager)); + params.put("startTimestamp", startTimestamp); + params.put("maxShardConnections", String.valueOf(maxShardConnections)); + params.put("sessionFilePath", getGcsPath(SESSION_FILE_NAME, gcsResourceManager)); + params.put("workerMachineType", machineType); + + PipelineLauncher.LaunchConfig.Builder options = + PipelineLauncher.LaunchConfig.builder(getClass().getSimpleName(), TEMPLATE_SPEC_PATH); + options + .addEnvironment("maxWorkers", maxWorkers) + .addEnvironment("numWorkers", numWorkers) + .addEnvironment("machineType", machineType) + .addEnvironment( + "additionalExperiments", java.util.Collections.singletonList("use_runner_v2")); + + options.setParameters(params); + return pipelineLauncher.launch(project, region, options.build()); + } + + private long getLogicalDatabaseRowCount(CloudSqlResourceManager manager, String dbName) { + String query = "SELECT COUNT(*) FROM " + dbName + ".MigrationLoadTest"; + List> result = + manager.runSQLQuery(query); // Using runSQLQuery to execute simple counting query + if (result != null && !result.isEmpty()) { + Map row = result.get(0); + for (Object val : row.values()) { + if (val instanceof Number) { + return ((Number) val).longValue(); + } + } + } + return 0; + } + public void updateSpannerNodeCount(String instanceId, int nodeCount) { com.google.cloud.spanner.SpannerOptions options = com.google.cloud.spanner.SpannerOptions.newBuilder().setProjectId(project).build(); @@ -378,20 +475,20 @@ public void updateSpannerNodeCount(String instanceId, int nodeCount) { com.google.cloud.spanner.InstanceAdminClient instanceAdminClient = spanner.getInstanceAdminClient(); - // Capture the original node count before the first modification - if (originalSpannerNodeCount == null) { - com.google.cloud.spanner.Instance instance = instanceAdminClient.getInstance(instanceId); - originalSpannerNodeCount = instance.getNodeCount(); - LOG.info( - "Captured original Spanner instance {} node count: {}", - instanceId, - originalSpannerNodeCount); + int fromCount = -1; + if (spannerResourceManager != null + && instanceId.equals(spannerResourceManager.getInstanceId())) { + fromCount = originalSpannerNodeCount != null ? originalSpannerNodeCount : -1; + } else if (spannerMetadataResourceManager != null + && instanceId.equals(spannerMetadataResourceManager.getInstanceId())) { + fromCount = + originalSpannerMetadataNodeCount != null ? 
originalSpannerMetadataNodeCount : -1; } LOG.info( "Updating Spanner instance {} node count from {} to {}...", instanceId, - originalSpannerNodeCount, + fromCount, nodeCount); com.google.cloud.spanner.InstanceInfo instanceInfo = com.google.cloud.spanner.InstanceInfo.newBuilder( @@ -409,7 +506,6 @@ public void updateSpannerNodeCount(String instanceId, int nodeCount) { } } - /** Programmatically retrieves the current node count of the Spanner instance. */ public int getSpannerNodeCount(String instanceId) { com.google.cloud.spanner.SpannerOptions options = com.google.cloud.spanner.SpannerOptions.newBuilder().setProjectId(project).build(); From c5046d321d00b3e0031c96010445b34432048ab1 Mon Sep 17 00:00:00 2001 From: aasthabharill Date: Thu, 7 May 2026 23:11:44 +0530 Subject: [PATCH 4/6] final test --- .../templates/SpannerToSourceDbBacklogLT.java | 573 ++++++++++++++++++ 1 file changed, 573 insertions(+) create mode 100644 v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogLT.java diff --git a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogLT.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogLT.java new file mode 100644 index 0000000000..9d6bd1a498 --- /dev/null +++ b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogLT.java @@ -0,0 +1,573 @@ +/* + * Copyright (C) 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ +package com.google.cloud.teleport.v2.templates; + +import static org.apache.beam.it.common.TestProperties.getProperty; +import static org.apache.beam.it.truthmatchers.PipelineAsserts.assertThatPipeline; +import static org.apache.beam.it.truthmatchers.PipelineAsserts.assertThatResult; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import com.google.cloud.spanner.Mutation; +import com.google.cloud.spanner.Struct; +import com.google.cloud.teleport.metadata.TemplateLoadTest; +import com.google.cloud.teleport.v2.spanner.migrations.shard.Shard; +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import java.io.IOException; +import java.time.Duration; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.beam.it.common.PipelineLauncher; +import org.apache.beam.it.common.PipelineOperator; +import org.apache.beam.it.common.TestProperties; +import org.apache.beam.it.gcp.cloudsql.CloudSqlResourceManager; +import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator; +import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator.DatabaseType; +import org.apache.beam.it.gcp.dataflow.ClassicTemplateClient; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Performance / Load test for {@link SpannerToSourceDb} template. + * + *

Objective: Validate if Reverse Replication pipeline can successfully process a massive backlog + * of 1 Billion rows (1 Terabyte of data) generated and imported into Spanner. + */ +@Category(TemplateLoadTest.class) +@TemplateLoadTest(SpannerToSourceDb.class) +@RunWith(JUnit4.class) +public class SpannerToSourceDbBacklogLT extends SpannerToSourceDbLTBase { + + private static final Logger LOG = LoggerFactory.getLogger(SpannerToSourceDbBacklogLT.class); + + private static final String TEMPLATE_SPEC_PATH = + com.google.common.base.MoreObjects.firstNonNull( + TestProperties.specPath(), "gs://dataflow-templates/latest/flex/Spanner_to_SourceDb"); + + private final String spannerDdlResource = "SpannerToSourceDbBacklogLT/spanner-schema.sql"; + private final String sessionFileResource = "SpannerToSourceDbBacklogLT/session.json"; + private final String table = "MigrationLoadTest"; + + private CloudSqlShardOrchestrator orchestrator; + private Integer originalSpannerNodeCount = null; + private Integer originalSpannerMetadataNodeCount = null; + + @Before + public void setup() throws IOException { + LOG.info( + "Initializing resource managers for High-Scale Backlog Replication E2E Load Test via Orchestrator..."); + + // Setup Spanner database and metadata database, GCS artifact resource manager, and session + // files + setupResourceManagers(spannerDdlResource, sessionFileResource); + + // Initialize the Cloud SQL Shard Orchestrator for dynamic GCP-level provisioning over Private + // IP + orchestrator = + new CloudSqlShardOrchestrator( + DatabaseType.MYSQL, + CloudSqlShardOrchestrator.MYSQL_8_0, + project, + region, + gcsResourceManager); + + Map> shardMap = new HashMap<>(); + shardMap.put("nokill-high-resources-backlog-shard1", List.of("shard0", "shard1")); + shardMap.put("nokill-high-resources-backlog-shard2", List.of("shard2", "shard3")); + + // Initialize the physical instances (reusing existing ones) and logical schemas + orchestrator.initialize(shardMap, "orchestrator_shards_bulk.json"); + + // Create logical table schemas inside each database shard + LOG.info("Creating logical schemas on MySQL shards..."); + CloudSqlResourceManager manager1 = + (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard1"); + CloudSqlResourceManager manager2 = + (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard2"); + createLogicalTableSchema(manager1, "shard0"); + createLogicalTableSchema(manager1, "shard1"); + createLogicalTableSchema(manager2, "shard2"); + createLogicalTableSchema(manager2, "shard3"); + + // Upload sharding configuration in the flat format expected by SpannerToSourceDb + LOG.info("Generating and uploading flat sharding configuration to GCS..."); + createAndUploadShardConfigToGcs(); + + // Store original node counts for cleanup + originalSpannerNodeCount = getSpannerNodeCount(spannerResourceManager.getInstanceId()); + originalSpannerMetadataNodeCount = + getSpannerNodeCount(spannerMetadataResourceManager.getInstanceId()); + } + + @After + public void tearDown() { + LOG.info("Cleaning up resources..."); + + // Reset Spanner instance to its original node count if it was modified + if (originalSpannerNodeCount != null && spannerResourceManager != null) { + try { + updateSpannerNodeCount(spannerResourceManager.getInstanceId(), originalSpannerNodeCount); + } catch (Exception e) { + LOG.warn("Failed to reset Spanner node count during teardown: ", e); + } + } + // Reset Spanner Metadata instance to its original node count if it was 
modified + if (originalSpannerMetadataNodeCount != null && spannerMetadataResourceManager != null) { + try { + updateSpannerNodeCount( + spannerMetadataResourceManager.getInstanceId(), originalSpannerMetadataNodeCount); + } catch (Exception e) { + LOG.warn("Failed to reset Spanner Metadata node count during teardown: ", e); + } + } + + cleanupResourceManagers(); + if (orchestrator != null) { + orchestrator.cleanup(); + } + } + + @Test + public void reverseReplicationBacklogLoadTest() + throws IOException, java.text.ParseException, InterruptedException { + LOG.info("Running High-Scale Backlog Replication E2E Load Test (1 Billion rows)..."); + + // ------------------------------------------------------------- + // PHASE 1: Connectivity & Setup Sanity + // ------------------------------------------------------------- + LOG.info("PHASE 1: Verifying Spanner & CloudSQL setup connectivity..."); + assertNotNull("Spanner resource manager should be initialized", spannerResourceManager); + + String testId = "test-id-12345"; + String testPayload = "test-payload-sanity"; + String testShardId = "shard_0"; + + // Write/Read Spanner Ping Row + List mutations = new ArrayList<>(); + mutations.add( + Mutation.newInsertOrUpdateBuilder("MigrationLoadTest") + .set("Id") + .to(testId) + .set("Payload") + .to(testPayload) + .set("migration_shard_id") + .to(testShardId) + .build()); + spannerResourceManager.write(mutations); + + List spannerResults = + spannerResourceManager.runQuery( + String.format( + "SELECT Payload FROM MigrationLoadTest WHERE migration_shard_id = '%s' AND Id = '%s'", + testShardId, testId)); + assertNotNull("Results from Spanner should not be null", spannerResults); + assertEquals("Should return exactly 1 row", 1, spannerResults.size()); + assertEquals( + "Payload matches what was written to Spanner", + testPayload, + spannerResults.get(0).getString("Payload")); + spannerResourceManager.write( + List.of( + Mutation.delete( + "MigrationLoadTest", com.google.cloud.spanner.Key.of(testShardId, testId)))); + + // Verify GCS configs exist + String sessionGcsPath = getGcsPath(SESSION_FILE_NAME, gcsResourceManager); + assertTrue("Session file should exist on GCS", sessionGcsPath.startsWith("gs://")); + String shardGcsPath = getGcsPath(SOURCE_SHARDS_FILE_NAME, gcsResourceManager); + assertTrue("Shard file should exist on GCS", shardGcsPath.startsWith("gs://")); + + // Verify CloudSQL connectivity + CloudSqlResourceManager manager1 = + (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard1"); + CloudSqlResourceManager manager2 = + (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard2"); + assertNotNull("Shard 1 resource manager should be initialized", manager1); + assertNotNull("Shard 2 resource manager should be initialized", manager2); + verifyMySqlLogicalShard(manager1, "shard0"); + verifyMySqlLogicalShard(manager1, "shard1"); + verifyMySqlLogicalShard(manager2, "shard2"); + verifyMySqlLogicalShard(manager2, "shard3"); + + // ------------------------------------------------------------- + // PHASE 2: Spanner Scale-Up & Avro Import + // ------------------------------------------------------------- + LOG.info("PHASE 2: Scaling Spanner & running Avro Import..."); + + // Record UTC start timestamp before import begins (to serve as change stream start timestamp) + String startTimestamp = java.time.Instant.now().toString(); + LOG.info("Recorded UTC start timestamp for change stream: {}", startTimestamp); + + int scaleNodes = + 
Integer.parseInt(getProperty("spannerScaleNodes", "25", TestProperties.Type.PROPERTY)); + updateSpannerNodeCount(spannerResourceManager.getInstanceId(), scaleNodes); + + // Verify scale-up + int currentNodeCount = getSpannerNodeCount(spannerResourceManager.getInstanceId()); + assertEquals( + "Spanner instance node count mismatch after scale-up", scaleNodes, currentNodeCount); + + // Run Avro Import with complete dataset (1 billion rows) + String avroInputDir = + getProperty( + "avroInputDir", + "gs://nokill-spanner-to-sourcedb-load/data/avro/", + TestProperties.Type.PROPERTY); + long expectedSpannerCount = + Long.parseLong( + getProperty("expectedSpannerCount", "1000000000", TestProperties.Type.PROPERTY)); + int importTimeoutMinutes = + Integer.parseInt(getProperty("importTimeoutMinutes", "120", TestProperties.Type.PROPERTY)); + + // Ensure avroInputDir ends with a trailing slash for the classic import template + if (!avroInputDir.endsWith("/")) { + avroInputDir = avroInputDir + "/"; + } + + LOG.info("Avro Input Directory: {}", avroInputDir); + LOG.info("Expected Spanner count: {}", expectedSpannerCount); + + PipelineLauncher.LaunchInfo importJobInfo = launchClassicImportJob(avroInputDir); + assertThatPipeline(importJobInfo).isRunning(); + + PipelineOperator.Result importResult = + pipelineOperator.waitUntilDone( + createConfig(importJobInfo, Duration.ofMinutes(importTimeoutMinutes))); + assertThatResult(importResult).isLaunchFinished(); + + long spannerCount = spannerResourceManager.getRowCount(table); + assertEquals("Spanner database row count mismatch", expectedSpannerCount, spannerCount); + LOG.info("Import Phase successful! Imported {} rows.", spannerCount); + + // ------------------------------------------------------------- + // PHASE 3: Downscale & Reverse Replication E2E Verification + // ------------------------------------------------------------- + // Downscale main Spanner instance to 5 nodes and upscale metadata Spanner instance to 20 nodes + LOG.info( + "Downscaling main Spanner instance to 5 nodes and upscaling metadata instance to 20 nodes before starting replication..."); + updateSpannerNodeCount(spannerResourceManager.getInstanceId(), 5); + updateSpannerNodeCount(spannerMetadataResourceManager.getInstanceId(), 20); + + int reverseTimeoutMinutes = + Integer.parseInt(getProperty("reverseTimeoutMinutes", "600", TestProperties.Type.PROPERTY)); + int maxShardConnections = + Integer.parseInt(getProperty("maxShardConnections", "2000", TestProperties.Type.PROPERTY)); + int numWorkers = + Integer.parseInt(getProperty("numWorkers", "200", TestProperties.Type.PROPERTY)); + int maxWorkers = + Integer.parseInt(getProperty("maxWorkers", "200", TestProperties.Type.PROPERTY)); + String machineType = getProperty("machineType", "n2-highmem-8", TestProperties.Type.PROPERTY); + + long expectedShardCount = + Long.parseLong( + getProperty("expectedShardCount", "250000000", TestProperties.Type.PROPERTY)); + long metricThreshold = + Long.parseLong(getProperty("metricThreshold", "999999999", TestProperties.Type.PROPERTY)); + + PipelineLauncher.LaunchInfo reverseJobInfo = + launchReverseReplicationJob( + startTimestamp, numWorkers, maxWorkers, machineType, maxShardConnections); + assertThatPipeline(reverseJobInfo).isRunning(); + + // Poll success_record_count metric until it reaches the threshold + long polledCount = 0; + long startTimeMillis = System.currentTimeMillis(); + int numShards = 4; + + while (polledCount < metricThreshold) { + if (System.currentTimeMillis() - startTimeMillis > 
reverseTimeoutMinutes * 60 * 1000) { + throw new RuntimeException( + "Reverse replication load check timed out after " + + reverseTimeoutMinutes + + " minutes."); + } + Thread.sleep(300000); // Poll every 5 minutes + + Double metricVal = + pipelineLauncher.getMetric( + project, region, reverseJobInfo.jobId(), "success_record_count"); + polledCount = metricVal != null ? metricVal.longValue() : 0; + + Double severeErrors = + pipelineLauncher.getMetric(project, region, reverseJobInfo.jobId(), "severe_error_count"); + double severeErrorVal = severeErrors != null ? severeErrors : 0.0; + + Double skippedRecords = + pipelineLauncher.getMetric( + project, region, reverseJobInfo.jobId(), "skipped_record_count"); + double skippedRecordVal = skippedRecords != null ? skippedRecords : 0.0; + + LOG.info("--- PIPELINE PROGRESS UPDATE ---"); + LOG.info( + "Time Elapsed: {} minutes / {} minutes", + (System.currentTimeMillis() - startTimeMillis) / 60000, + reverseTimeoutMinutes); + LOG.info( + "Polled success_record_count: {}. Target threshold: {}", polledCount, metricThreshold); + LOG.info("Severe errors so far: {}", severeErrorVal); + LOG.info("Skipped records so far: {}", skippedRecordVal); + + for (int i = 1; i <= numShards; ++i) { + Double replicationLag = + pipelineLauncher.getMetric( + project, + region, + reverseJobInfo.jobId(), + "replication_lag_in_seconds_Shard" + i + "_MEAN"); + LOG.info( + "Replication Lag Mean Shard{}: {} seconds", + i, + replicationLag != null ? replicationLag : 0.0); + } + LOG.info("---------------------------------"); + } + + // Verify database parity on MySQL shards + LOG.info( + "Replication threshold reached. Verifying logical databases row counts on CloudSQL..."); + long count0 = getLogicalDatabaseRowCount(manager1, "shard0"); + long count1 = getLogicalDatabaseRowCount(manager1, "shard1"); + long count2 = getLogicalDatabaseRowCount(manager2, "shard2"); + long count3 = getLogicalDatabaseRowCount(manager2, "shard3"); + + LOG.info( + "Logical databases replicated row counts: shard0={}, shard1={}, shard2={}, shard3={}", + count0, + count1, + count2, + count3); + assertEquals("shard0 row count mismatch", expectedShardCount, count0); + assertEquals("shard1 row count mismatch", expectedShardCount, count1); + assertEquals("shard2 row count mismatch", expectedShardCount, count2); + assertEquals("shard3 row count mismatch", expectedShardCount, count3); + + LOG.info("All sharded replication backlog counts successfully verified. 
Cancelling job..."); + PipelineOperator.Result cancelResult = + pipelineOperator.cancelJobAndFinish(createConfig(reverseJobInfo, Duration.ofMinutes(20))); + assertThatResult(cancelResult).isLaunchFinished(); + + exportMetrics(reverseJobInfo, numShards); + LOG.info("High-Scale Backlog Replication E2E Load Test passed successfully!"); + } + + private void verifyMySqlLogicalShard(CloudSqlResourceManager manager, String dbName) { + LOG.info("Verifying logical database: {}...", dbName); + + String testId = "test-id-" + dbName; + String testPayload = "payload-" + dbName; + + // Insert test row + String insertSql = + String.format( + "INSERT INTO %s.MigrationLoadTest (Id, Payload) VALUES ('%s', '%s')", + dbName, testId, testPayload); + manager.runSQLUpdate(insertSql); + + // Query test row back + String selectSql = + String.format("SELECT Payload FROM %s.MigrationLoadTest WHERE Id = '%s'", dbName, testId); + List> result = manager.runSQLQuery(selectSql); + + assertNotNull("Result from MySQL logical shard " + dbName + " should not be null", result); + assertEquals("Should return exactly 1 row", 1, result.size()); + assertEquals( + "Payload matches what was written to " + dbName, testPayload, result.get(0).get("Payload")); + + // Cleanup test row + String deleteSql = + String.format("DELETE FROM %s.MigrationLoadTest WHERE Id = '%s'", dbName, testId); + manager.runSQLUpdate(deleteSql); + LOG.info("Logical database {} verified successfully.", dbName); + } + + private void createLogicalTableSchema(CloudSqlResourceManager manager, String dbName) { + manager.runSQLUpdate( + "CREATE TABLE IF NOT EXISTS " + + dbName + + ".MigrationLoadTest (" + + "Id VARCHAR(36) NOT NULL," + + "Payload LONGTEXT NOT NULL," + + "PRIMARY KEY (Id)" + + ") ENGINE=InnoDB"); + } + + private void createAndUploadShardConfigToGcs() throws IOException { + CloudSqlResourceManager manager1 = + (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard1"); + CloudSqlResourceManager manager2 = + (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard2"); + + JsonArray ja = new JsonArray(); + ja.add(createShardConfig("shard_0", "shard0", manager1)); + ja.add(createShardConfig("shard_1", "shard1", manager1)); + ja.add(createShardConfig("shard_2", "shard2", manager2)); + ja.add(createShardConfig("shard_3", "shard3", manager2)); + + String shardFileContents = ja.toString(); + LOG.info("Shard file contents: {}", shardFileContents); + gcsResourceManager.createArtifact(SOURCE_SHARDS_FILE_NAME, shardFileContents); + } + + private JsonObject createShardConfig( + String logicalShardId, String dbName, CloudSqlResourceManager manager) { + Shard shard = new Shard(); + shard.setLogicalShardId(logicalShardId); + shard.setUser(manager.getUsername()); + shard.setHost(manager.getHost()); + shard.setPassword(manager.getPassword()); + shard.setPort(String.valueOf(manager.getPort())); + shard.setDbName(dbName); + JsonObject jsObj = (JsonObject) new Gson().toJsonTree(shard).getAsJsonObject(); + jsObj.remove("secretManagerUri"); + return jsObj; + } + + private PipelineLauncher.LaunchInfo launchClassicImportJob(String inputDir) throws IOException { + ClassicTemplateClient classicClient = ClassicTemplateClient.builder(CREDENTIALS).build(); + + Map params = new HashMap<>(); + params.put("instanceId", spannerResourceManager.getInstanceId()); + params.put("databaseId", spannerResourceManager.getDatabaseId()); + params.put("inputDir", inputDir); + + PipelineLauncher.LaunchConfig options = + 
PipelineLauncher.LaunchConfig.builder( + "spanner-avro-import-backlog", + "gs://dataflow-templates/latest/GCS_Avro_to_Cloud_Spanner") + .setParameters(params) + .addEnvironment("numWorkers", 80) + .addEnvironment("maxWorkers", 500) + .addEnvironment("machineType", "n2-highmem-8") + .build(); + + return classicClient.launch(project, region, options); + } + + private PipelineLauncher.LaunchInfo launchReverseReplicationJob( + String startTimestamp, + int numWorkers, + int maxWorkers, + String machineType, + int maxShardConnections) + throws IOException { + + Map params = new HashMap<>(); + params.put("changeStreamName", "MigrationStream"); + params.put("instanceId", spannerResourceManager.getInstanceId()); + params.put("databaseId", spannerResourceManager.getDatabaseId()); + params.put("spannerProjectId", project); + params.put("metadataInstance", spannerMetadataResourceManager.getInstanceId()); + params.put("metadataDatabase", spannerMetadataResourceManager.getDatabaseId()); + params.put("sourceShardsFilePath", getGcsPath(SOURCE_SHARDS_FILE_NAME, gcsResourceManager)); + params.put("deadLetterQueueDirectory", getGcsPath("dlq", gcsResourceManager)); + params.put("startTimestamp", startTimestamp); + params.put("maxShardConnections", String.valueOf(maxShardConnections)); + params.put("sessionFilePath", getGcsPath(SESSION_FILE_NAME, gcsResourceManager)); + params.put("workerMachineType", machineType); + + PipelineLauncher.LaunchConfig.Builder options = + PipelineLauncher.LaunchConfig.builder(getClass().getSimpleName(), TEMPLATE_SPEC_PATH); + options + .addEnvironment("maxWorkers", maxWorkers) + .addEnvironment("numWorkers", numWorkers) + .addEnvironment("machineType", machineType) + .addEnvironment( + "additionalExperiments", java.util.Collections.singletonList("use_runner_v2")); + + options.setParameters(params); + return pipelineLauncher.launch(project, region, options.build()); + } + + private long getLogicalDatabaseRowCount(CloudSqlResourceManager manager, String dbName) { + String query = + "SELECT /*+ SET_VAR(innodb_parallel_read_threads=94) */ COUNT(*) FROM " + + dbName + + ".MigrationLoadTest"; + List> result = manager.runSQLQuery(query); + if (result != null && !result.isEmpty()) { + Map row = result.get(0); + for (Object val : row.values()) { + if (val instanceof Number) { + return ((Number) val).longValue(); + } + } + } + return 0; + } + + public void updateSpannerNodeCount(String instanceId, int nodeCount) { + com.google.cloud.spanner.SpannerOptions options = + com.google.cloud.spanner.SpannerOptions.newBuilder().setProjectId(project).build(); + try (com.google.cloud.spanner.Spanner spanner = options.getService()) { + com.google.cloud.spanner.InstanceAdminClient instanceAdminClient = + spanner.getInstanceAdminClient(); + + int fromCount = -1; + if (spannerResourceManager != null + && instanceId.equals(spannerResourceManager.getInstanceId())) { + fromCount = originalSpannerNodeCount != null ? originalSpannerNodeCount : -1; + } else if (spannerMetadataResourceManager != null + && instanceId.equals(spannerMetadataResourceManager.getInstanceId())) { + fromCount = + originalSpannerMetadataNodeCount != null ? 
originalSpannerMetadataNodeCount : -1; + } + + LOG.info( + "Updating Spanner instance {} node count from {} to {}...", + instanceId, + fromCount, + nodeCount); + com.google.cloud.spanner.InstanceInfo instanceInfo = + com.google.cloud.spanner.InstanceInfo.newBuilder( + com.google.cloud.spanner.InstanceId.of(project, instanceId)) + .setNodeCount(nodeCount) + .build(); + instanceAdminClient + .updateInstance( + instanceInfo, com.google.cloud.spanner.InstanceInfo.InstanceField.NODE_COUNT) + .get(); + LOG.info("Successfully updated Spanner instance {} node count to {}.", instanceId, nodeCount); + } catch (Exception e) { + LOG.error("Failed to update Spanner instance node count.", e); + throw new RuntimeException("Failed to update Spanner node count", e); + } + } + + public int getSpannerNodeCount(String instanceId) { + com.google.cloud.spanner.SpannerOptions options = + com.google.cloud.spanner.SpannerOptions.newBuilder().setProjectId(project).build(); + try (com.google.cloud.spanner.Spanner spanner = options.getService()) { + com.google.cloud.spanner.InstanceAdminClient instanceAdminClient = + spanner.getInstanceAdminClient(); + com.google.cloud.spanner.Instance instance = instanceAdminClient.getInstance(instanceId); + return instance.getNodeCount(); + } catch (Exception e) { + LOG.error("Failed to retrieve Spanner instance node count.", e); + throw new RuntimeException("Failed to get Spanner node count", e); + } + } +} From 731188d8f15a98f56641766796cf593a8f8c8cc1 Mon Sep 17 00:00:00 2001 From: aasthabharill Date: Fri, 8 May 2026 12:41:14 +0530 Subject: [PATCH 5/6] productionized test --- .github/workflows/spanner-load-tests.yml | 2 +- .../cloudsql/CloudSqlShardOrchestrator.java | 18 +- .../ShardOrchestrationException.java | 2 +- .../templates/SpannerToSourceDbBacklogLT.java | 397 ++++++------- .../SpannerToSourceDbBacklogStepLT.java | 522 ------------------ 5 files changed, 192 insertions(+), 749 deletions(-) rename {v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting => it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql}/ShardOrchestrationException.java (93%) delete mode 100644 v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java diff --git a/.github/workflows/spanner-load-tests.yml b/.github/workflows/spanner-load-tests.yml index 3c553158a6..fc3d1f82e1 100644 --- a/.github/workflows/spanner-load-tests.yml +++ b/.github/workflows/spanner-load-tests.yml @@ -32,7 +32,7 @@ permissions: write-all jobs: load_tests: name: Spanner Dataflow Templates Load tests - timeout-minutes: 1800 # 30 hours + timeout-minutes: 2100 # 35 hours # Run on any runner that matches all the specified runs-on values. runs-on: [ self-hosted, perf ] steps: diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql/CloudSqlShardOrchestrator.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql/CloudSqlShardOrchestrator.java index d700409782..30b82d7f29 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql/CloudSqlShardOrchestrator.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql/CloudSqlShardOrchestrator.java @@ -216,9 +216,10 @@ protected T executeWithRetries( * @param shardMap A mapping of physical instance names to the list of logical DB names to create. * @param artifactName The name of the artifact file (e.g., "shards.json"). * @return The full GCS URI to the generated bulkShardConfig.json. 
- * @throws RuntimeException if provisioning or creation fails after retries.
+ * @throws ShardOrchestrationException if provisioning or creation fails after retries.
  */
-  public String initialize(Map<String, List<String>> shardMap, String artifactName) {
+  public String initialize(Map<String, List<String>> shardMap, String artifactName)
+      throws ShardOrchestrationException {
     this.requestedShardMap = new HashMap<>(shardMap);
     LOG.info("Initializing shard orchestrator for {} physical instances", shardMap.size());
@@ -233,7 +234,7 @@ public String initialize(Map<String, List<String>> shardMap, String artifactName
       return generateAndUploadConfig(artifactName);
     } catch (Exception e) {
       LOG.error("Exception while initializing sharded environment", e);
-      throw new RuntimeException("Failed to initialize sharded environment", e);
+      throw new ShardOrchestrationException("Failed to initialize sharded environment", e);
     }
   }
@@ -402,6 +403,11 @@ protected void createLogicalDatabases() {
                 CloudSqlResourceManager manager = createManager(instanceName);
                 managers.put(instanceName, manager);
                 for (String dbName : dbNames) {
+                  try {
+                    manager.runSQLUpdate("DROP DATABASE IF EXISTS " + dbName);
+                  } catch (Exception e) {
+                    LOG.warn("Failed to drop pre-existing database {}", dbName, e);
+                  }
                   manager.createDatabase(dbName);
                 }
               }));
@@ -458,16 +464,16 @@ protected void awaitAndShutdownExecutor(
     executor.shutdown();
     try {
       if (!executor.awaitTermination(60, TimeUnit.MINUTES)) {
-        throw new RuntimeException(phase + " phase timed out");
+        throw new ShardOrchestrationException(phase + " phase timed out");
       }
       for (java.util.concurrent.Future<?> future : futures) {
         future.get();
       }
     } catch (java.util.concurrent.ExecutionException e) {
-      throw new RuntimeException(phase + " phase failed", e.getCause());
+      throw new ShardOrchestrationException(phase + " phase failed", e.getCause());
     } catch (InterruptedException e) {
       Thread.currentThread().interrupt();
-      throw new RuntimeException(phase + " phase interrupted", e);
+      throw new ShardOrchestrationException(phase + " phase interrupted", e);
     }
   }
 
diff --git a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/ShardOrchestrationException.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql/ShardOrchestrationException.java
similarity index 93%
rename from v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/ShardOrchestrationException.java
rename to it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql/ShardOrchestrationException.java
index 8f6f469946..659c9763f5 100644
--- a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/templates/loadtesting/ShardOrchestrationException.java
+++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql/ShardOrchestrationException.java
@@ -13,7 +13,7 @@
  * License for the specific language governing permissions and limitations under
  * the License.
  */
-package com.google.cloud.teleport.v2.templates.loadtesting;
+package org.apache.beam.it.gcp.cloudsql;
 
/** Exception thrown when Cloud SQL shard orchestration fails. 
*/ public class ShardOrchestrationException extends RuntimeException { diff --git a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogLT.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogLT.java index 9d6bd1a498..93792c31c0 100644 --- a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogLT.java +++ b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogLT.java @@ -19,22 +19,30 @@ import static org.apache.beam.it.truthmatchers.PipelineAsserts.assertThatPipeline; import static org.apache.beam.it.truthmatchers.PipelineAsserts.assertThatResult; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; -import com.google.cloud.spanner.Mutation; -import com.google.cloud.spanner.Struct; +import com.google.cloud.spanner.Instance; +import com.google.cloud.spanner.InstanceAdminClient; +import com.google.cloud.spanner.InstanceId; +import com.google.cloud.spanner.InstanceInfo; +import com.google.cloud.spanner.InstanceInfo.InstanceField; +import com.google.cloud.spanner.Spanner; +import com.google.cloud.spanner.SpannerOptions; import com.google.cloud.teleport.metadata.TemplateLoadTest; import com.google.cloud.teleport.v2.spanner.migrations.shard.Shard; +import com.google.common.base.MoreObjects; import com.google.gson.Gson; import com.google.gson.JsonArray; import com.google.gson.JsonObject; import java.io.IOException; +import java.text.ParseException; import java.time.Duration; -import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import org.apache.beam.it.common.PipelineLauncher; import org.apache.beam.it.common.PipelineOperator; import org.apache.beam.it.common.TestProperties; @@ -65,28 +73,24 @@ public class SpannerToSourceDbBacklogLT extends SpannerToSourceDbLTBase { private static final Logger LOG = LoggerFactory.getLogger(SpannerToSourceDbBacklogLT.class); private static final String TEMPLATE_SPEC_PATH = - com.google.common.base.MoreObjects.firstNonNull( + MoreObjects.firstNonNull( TestProperties.specPath(), "gs://dataflow-templates/latest/flex/Spanner_to_SourceDb"); - private final String spannerDdlResource = "SpannerToSourceDbBacklogLT/spanner-schema.sql"; - private final String sessionFileResource = "SpannerToSourceDbBacklogLT/session.json"; - private final String table = "MigrationLoadTest"; + private static final String SPANNER_DDL_RESOURCE = + "SpannerToSourceDbBacklogLT/spanner-schema.sql"; + private static final String SESSION_FILE_RESOURCE = "SpannerToSourceDbBacklogLT/session.json"; + private static final String TABLE = "MigrationLoadTest"; private CloudSqlShardOrchestrator orchestrator; + private CloudSqlResourceManager manager1; + private CloudSqlResourceManager manager2; private Integer originalSpannerNodeCount = null; private Integer originalSpannerMetadataNodeCount = null; @Before public void setup() throws IOException { - LOG.info( - "Initializing resource managers for High-Scale Backlog Replication E2E Load Test via Orchestrator..."); - - // Setup Spanner database and metadata database, GCS artifact resource manager, and session - // files - setupResourceManagers(spannerDdlResource, sessionFileResource); + 
setupResourceManagers(SPANNER_DDL_RESOURCE, SESSION_FILE_RESOURCE); - // Initialize the Cloud SQL Shard Orchestrator for dynamic GCP-level provisioning over Private - // IP orchestrator = new CloudSqlShardOrchestrator( DatabaseType.MYSQL, @@ -95,26 +99,30 @@ public void setup() throws IOException { region, gcsResourceManager); + // The CloudSQL setup consists of 2 physical shards with 2 logical shards each + String physicalShard1 = + getProperty( + "physicalShard1", "nokill-high-resources-backlog-shard1", TestProperties.Type.PROPERTY); + String physicalShard2 = + getProperty( + "physicalShard2", "nokill-high-resources-backlog-shard2", TestProperties.Type.PROPERTY); + Map> shardMap = new HashMap<>(); - shardMap.put("nokill-high-resources-backlog-shard1", List.of("shard0", "shard1")); - shardMap.put("nokill-high-resources-backlog-shard2", List.of("shard2", "shard3")); + shardMap.put(physicalShard1, List.of("shard0", "shard1")); + shardMap.put(physicalShard2, List.of("shard2", "shard3")); // Initialize the physical instances (reusing existing ones) and logical schemas orchestrator.initialize(shardMap, "orchestrator_shards_bulk.json"); + manager1 = (CloudSqlResourceManager) orchestrator.managers.get(physicalShard1); + manager2 = (CloudSqlResourceManager) orchestrator.managers.get(physicalShard2); - // Create logical table schemas inside each database shard LOG.info("Creating logical schemas on MySQL shards..."); - CloudSqlResourceManager manager1 = - (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard1"); - CloudSqlResourceManager manager2 = - (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard2"); createLogicalTableSchema(manager1, "shard0"); createLogicalTableSchema(manager1, "shard1"); createLogicalTableSchema(manager2, "shard2"); createLogicalTableSchema(manager2, "shard3"); - // Upload sharding configuration in the flat format expected by SpannerToSourceDb - LOG.info("Generating and uploading flat sharding configuration to GCS..."); + LOG.info("Generating and uploading sharding configuration to GCS..."); createAndUploadShardConfigToGcs(); // Store original node counts for cleanup @@ -153,80 +161,25 @@ public void tearDown() { @Test public void reverseReplicationBacklogLoadTest() - throws IOException, java.text.ParseException, InterruptedException { - LOG.info("Running High-Scale Backlog Replication E2E Load Test (1 Billion rows)..."); + throws IOException, ParseException, InterruptedException { // ------------------------------------------------------------- - // PHASE 1: Connectivity & Setup Sanity + // PHASE 1: Import 1 billion rows to Spanner // ------------------------------------------------------------- - LOG.info("PHASE 1: Verifying Spanner & CloudSQL setup connectivity..."); - assertNotNull("Spanner resource manager should be initialized", spannerResourceManager); - - String testId = "test-id-12345"; - String testPayload = "test-payload-sanity"; - String testShardId = "shard_0"; - - // Write/Read Spanner Ping Row - List mutations = new ArrayList<>(); - mutations.add( - Mutation.newInsertOrUpdateBuilder("MigrationLoadTest") - .set("Id") - .to(testId) - .set("Payload") - .to(testPayload) - .set("migration_shard_id") - .to(testShardId) - .build()); - spannerResourceManager.write(mutations); - - List spannerResults = - spannerResourceManager.runQuery( - String.format( - "SELECT Payload FROM MigrationLoadTest WHERE migration_shard_id = '%s' AND Id = '%s'", - testShardId, testId)); - assertNotNull("Results from 
Spanner should not be null", spannerResults); - assertEquals("Should return exactly 1 row", 1, spannerResults.size()); - assertEquals( - "Payload matches what was written to Spanner", - testPayload, - spannerResults.get(0).getString("Payload")); - spannerResourceManager.write( - List.of( - Mutation.delete( - "MigrationLoadTest", com.google.cloud.spanner.Key.of(testShardId, testId)))); - - // Verify GCS configs exist - String sessionGcsPath = getGcsPath(SESSION_FILE_NAME, gcsResourceManager); - assertTrue("Session file should exist on GCS", sessionGcsPath.startsWith("gs://")); - String shardGcsPath = getGcsPath(SOURCE_SHARDS_FILE_NAME, gcsResourceManager); - assertTrue("Shard file should exist on GCS", shardGcsPath.startsWith("gs://")); - - // Verify CloudSQL connectivity - CloudSqlResourceManager manager1 = - (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard1"); - CloudSqlResourceManager manager2 = - (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard2"); - assertNotNull("Shard 1 resource manager should be initialized", manager1); - assertNotNull("Shard 2 resource manager should be initialized", manager2); - verifyMySqlLogicalShard(manager1, "shard0"); - verifyMySqlLogicalShard(manager1, "shard1"); - verifyMySqlLogicalShard(manager2, "shard2"); - verifyMySqlLogicalShard(manager2, "shard3"); + LOG.info("PHASE 1: Import 1 billion rows to Spanner"); - // ------------------------------------------------------------- - // PHASE 2: Spanner Scale-Up & Avro Import - // ------------------------------------------------------------- - LOG.info("PHASE 2: Scaling Spanner & running Avro Import..."); - - // Record UTC start timestamp before import begins (to serve as change stream start timestamp) + // Record UTC start timestamp before import begins (to serve as start timestamp to reverse + // replication job) String startTimestamp = java.time.Instant.now().toString(); - LOG.info("Recorded UTC start timestamp for change stream: {}", startTimestamp); + LOG.info("Recorded UTC start timestamp for Reverse Replication Job: {}", startTimestamp); + // Node count taken from manual test results available in go/reverse-backlog-manual-tests int scaleNodes = Integer.parseInt(getProperty("spannerScaleNodes", "25", TestProperties.Type.PROPERTY)); updateSpannerNodeCount(spannerResourceManager.getInstanceId(), scaleNodes); - // Verify scale-up + // Verify scale-up - it is critical that the Spanner instance is scaled up, otherwise the + // pipeline might run out of resources or face bottleneck issues. 
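+    // Note (added for clarity): updateSpannerNodeCount blocks on the instance update operation
+    // (updateInstance(...).get() in its body), so the read below observes the settled node count
+    // rather than racing an in-flight scale-up.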
int currentNodeCount = getSpannerNodeCount(spannerResourceManager.getInstanceId()); assertEquals( "Spanner instance node count mismatch after scale-up", scaleNodes, currentNodeCount); @@ -251,7 +204,7 @@ public void reverseReplicationBacklogLoadTest() LOG.info("Avro Input Directory: {}", avroInputDir); LOG.info("Expected Spanner count: {}", expectedSpannerCount); - PipelineLauncher.LaunchInfo importJobInfo = launchClassicImportJob(avroInputDir); + PipelineLauncher.LaunchInfo importJobInfo = launchImportJob(avroInputDir); assertThatPipeline(importJobInfo).isRunning(); PipelineOperator.Result importResult = @@ -259,18 +212,26 @@ public void reverseReplicationBacklogLoadTest() createConfig(importJobInfo, Duration.ofMinutes(importTimeoutMinutes))); assertThatResult(importResult).isLaunchFinished(); - long spannerCount = spannerResourceManager.getRowCount(table); + long spannerCount = spannerResourceManager.getRowCount(TABLE); assertEquals("Spanner database row count mismatch", expectedSpannerCount, spannerCount); LOG.info("Import Phase successful! Imported {} rows.", spannerCount); // ------------------------------------------------------------- - // PHASE 3: Downscale & Reverse Replication E2E Verification + // PHASE 2: Reverse Replication E2E Verification // ------------------------------------------------------------- // Downscale main Spanner instance to 5 nodes and upscale metadata Spanner instance to 20 nodes + // (go/reverse-backlog-manual-tests) + int spannerDownscaleNodes = + Integer.parseInt(getProperty("spannerDownscaleNodes", "5", TestProperties.Type.PROPERTY)); + int metadataScaleNodes = + Integer.parseInt(getProperty("metadataScaleNodes", "20", TestProperties.Type.PROPERTY)); + LOG.info( - "Downscaling main Spanner instance to 5 nodes and upscaling metadata instance to 20 nodes before starting replication..."); - updateSpannerNodeCount(spannerResourceManager.getInstanceId(), 5); - updateSpannerNodeCount(spannerMetadataResourceManager.getInstanceId(), 20); + "Downscaling main Spanner instance to {} nodes and upscaling metadata instance to {} nodes before starting replication...", + spannerDownscaleNodes, + metadataScaleNodes); + updateSpannerNodeCount(spannerResourceManager.getInstanceId(), spannerDownscaleNodes); + updateSpannerNodeCount(spannerMetadataResourceManager.getInstanceId(), metadataScaleNodes); int reverseTimeoutMinutes = Integer.parseInt(getProperty("reverseTimeoutMinutes", "600", TestProperties.Type.PROPERTY)); @@ -282,18 +243,26 @@ public void reverseReplicationBacklogLoadTest() Integer.parseInt(getProperty("maxWorkers", "200", TestProperties.Type.PROPERTY)); String machineType = getProperty("machineType", "n2-highmem-8", TestProperties.Type.PROPERTY); - long expectedShardCount = - Long.parseLong( - getProperty("expectedShardCount", "250000000", TestProperties.Type.PROPERTY)); - long metricThreshold = - Long.parseLong(getProperty("metricThreshold", "999999999", TestProperties.Type.PROPERTY)); - PipelineLauncher.LaunchInfo reverseJobInfo = launchReverseReplicationJob( startTimestamp, numWorkers, maxWorkers, machineType, maxShardConnections); assertThatPipeline(reverseJobInfo).isRunning(); - // Poll success_record_count metric until it reaches the threshold + // This is a long running test (7-8 hours) and we don't expect the SQL count queries to pass + // the assertion for the first few hours (when the replication backlog is being processed). 
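+    // (For example, with the defaults below: metricThreshold = 1,000,000,000 = 4 shards x
+    // 250,000,000 rows each, so the first COUNT(*) round only fires once every imported row has
+    // been reported as successfully replicated at least once.)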
+    // Asserting on direct database counts during early stages would be highly inefficient and
+    // would put unnecessary load on the database shards. Thus, we poll the
+    // "success_record_count" metric and only start asserting on SQL counts once the metric
+    // threshold is met.
+    // Note that the above metric can NOT act as a reliable source of truth for the success of a
+    // replication pipeline. It is ONLY used as an indicator to start the SQL count verification.
+
+    long expectedShardCount =
+        Long.parseLong(
+            getProperty("expectedShardCount", "250000000", TestProperties.Type.PROPERTY));
+    long metricThreshold =
+        Long.parseLong(getProperty("metricThreshold", "1000000000", TestProperties.Type.PROPERTY));
+
     long polledCount = 0;
     long startTimeMillis = System.currentTimeMillis();
     int numShards = 4;
@@ -305,21 +274,11 @@ public void reverseReplicationBacklogLoadTest()
               + reverseTimeoutMinutes
               + " minutes.");
       }
-      Thread.sleep(300000); // Poll every 5 minutes
 
-      Double metricVal =
+      Double successRecordsCount =
           pipelineLauncher.getMetric(
               project, region, reverseJobInfo.jobId(), "success_record_count");
-      polledCount = metricVal != null ? metricVal.longValue() : 0;
-
-      Double severeErrors =
-          pipelineLauncher.getMetric(project, region, reverseJobInfo.jobId(), "severe_error_count");
-      double severeErrorVal = severeErrors != null ? severeErrors : 0.0;
-
-      Double skippedRecords =
-          pipelineLauncher.getMetric(
-              project, region, reverseJobInfo.jobId(), "skipped_record_count");
-      double skippedRecordVal = skippedRecords != null ? skippedRecords : 0.0;
+      polledCount = successRecordsCount != null ? successRecordsCount.longValue() : 0;
 
       LOG.info("--- PIPELINE PROGRESS UPDATE ---");
       LOG.info(
@@ -328,87 +287,107 @@ public void reverseReplicationBacklogLoadTest()
           reverseTimeoutMinutes);
       LOG.info(
           "Polled success_record_count: {}. Target threshold: {}", polledCount, metricThreshold);
-      LOG.info("Severe errors so far: {}", severeErrorVal);
-      LOG.info("Skipped records so far: {}", skippedRecordVal);
-
-      for (int i = 1; i <= numShards; ++i) {
-        Double replicationLag =
-            pipelineLauncher.getMetric(
-                project,
-                region,
-                reverseJobInfo.jobId(),
-                "replication_lag_in_seconds_Shard" + i + "_MEAN");
-        LOG.info(
-            "Replication Lag Mean Shard{}: {} seconds",
-            i,
-            replicationLag != null ? replicationLag : 0.0);
-      }
       LOG.info("---------------------------------");
+
+      if (polledCount >= metricThreshold) {
+        break;
+      }
+
+      Thread.sleep(
+          900000); // Poll every 15 minutes. Since the test runs for 7-8 hours, 15-minute
+                   // intervals emit four progress updates per hour, limiting log clutter and
+                   // metric API call volume.
     }
 
-    // Verify database parity on MySQL shards
+    // Verify database parity on MySQL shards with a retry loop to handle minor replication
+    // synchronization lag
     LOG.info(
-        "Replication threshold reached. 
+ long verificationStartTime = System.currentTimeMillis(); + int verificationTimeoutMinutes = + Integer.parseInt( + getProperty("verificationTimeoutMinutes", "30", TestProperties.Type.PROPERTY)); + long verificationTimeoutMs = + verificationTimeoutMinutes + * 60 + * 1000; // Configurable timeout (in minutes) for the final parity catch-up + boolean parityAchieved = false; + long count0 = 0, count1 = 0, count2 = 0, count3 = 0; + + // We execute the logical database row counts using a parallel thread pool instead of running + // them sequentially. At 1-billion row scale, executing SELECT COUNT(*) sequentially on 4 + // massive MySQL database shards takes around 12 minutes per iteration, causing the catch-up + // verification timeout to exhaust after only two or three iterations. Running in parallel + // reduces the query round-trip time to that of the slowest single shard query (~3 minutes), + // ensuring the loop has ample opportunities to poll and catch up. + ExecutorService executor = Executors.newFixedThreadPool(4); + try { + while (System.currentTimeMillis() - verificationStartTime < verificationTimeoutMs) { + CompletableFuture<Long> f0 = + CompletableFuture.supplyAsync( + () -> getLogicalDatabaseRowCount(manager1, "shard0"), executor); + CompletableFuture<Long> f1 = + CompletableFuture.supplyAsync( + () -> getLogicalDatabaseRowCount(manager1, "shard1"), executor); + CompletableFuture<Long> f2 = + CompletableFuture.supplyAsync( + () -> getLogicalDatabaseRowCount(manager2, "shard2"), executor); + CompletableFuture<Long> f3 = + CompletableFuture.supplyAsync( + () -> getLogicalDatabaseRowCount(manager2, "shard3"), executor); + + // Block and wait for all parallel database count queries to resolve and return their values + // - necessary for the subsequent parity check to evaluate correctly + count0 = f0.join(); + count1 = f1.join(); + count2 = f2.join(); + count3 = f3.join(); - LOG.info( - "Logical databases replicated row counts: shard0={}, shard1={}, shard2={}, shard3={}", - count0, - count1, - count2, - count3); - assertEquals("shard0 row count mismatch", expectedShardCount, count0); - assertEquals("shard1 row count mismatch", expectedShardCount, count1); - assertEquals("shard2 row count mismatch", expectedShardCount, count2); - assertEquals("shard3 row count mismatch", expectedShardCount, count3); + LOG.info( + "Polled replicated row counts: shard0={}, shard1={}, shard2={}, shard3={} (Target: {})", + count0, + count1, + count2, + count3, + expectedShardCount); + + if (count0 == expectedShardCount + && count1 == expectedShardCount + && count2 == expectedShardCount + && count3 == expectedShardCount) { + parityAchieved = true; + break; + } + + LOG.info( + "Database counts have not reached exact target parity yet. Retrying in 1 minute..."); + Thread.sleep(60000); // Polling retry interval of 1 minute + } + } finally { + executor.shutdown(); + } + + assertTrue( + String.format( + "Logical database row count mismatch after replication verification timeout. Replicated: shard0=%d, shard1=%d, shard2=%d, shard3=%d (Expected: %d each)", + count0, count1, count2, count3, expectedShardCount), + parityAchieved); LOG.info("All sharded replication backlog counts successfully verified.
Cancelling job..."); PipelineOperator.Result cancelResult = pipelineOperator.cancelJobAndFinish(createConfig(reverseJobInfo, Duration.ofMinutes(20))); assertThatResult(cancelResult).isLaunchFinished(); - exportMetrics(reverseJobInfo, numShards); - LOG.info("High-Scale Backlog Replication E2E Load Test passed successfully!"); - } - - private void verifyMySqlLogicalShard(CloudSqlResourceManager manager, String dbName) { - LOG.info("Verifying logical database: {}...", dbName); - - String testId = "test-id-" + dbName; - String testPayload = "payload-" + dbName; - - // Insert test row - String insertSql = - String.format( - "INSERT INTO %s.MigrationLoadTest (Id, Payload) VALUES ('%s', '%s')", - dbName, testId, testPayload); - manager.runSQLUpdate(insertSql); - - // Query test row back - String selectSql = - String.format("SELECT Payload FROM %s.MigrationLoadTest WHERE Id = '%s'", dbName, testId); - List<Map<String, Object>> result = manager.runSQLQuery(selectSql); - - assertNotNull("Result from MySQL logical shard " + dbName + " should not be null", result); - assertEquals("Should return exactly 1 row", 1, result.size()); - assertEquals( - "Payload matches what was written to " + dbName, testPayload, result.get(0).get("Payload")); - - // Cleanup test row - String deleteSql = - String.format("DELETE FROM %s.MigrationLoadTest WHERE Id = '%s'", dbName, testId); - manager.runSQLUpdate(deleteSql); - LOG.info("Logical database {} verified successfully.", dbName); - } private void createLogicalTableSchema(CloudSqlResourceManager manager, String dbName) { manager.runSQLUpdate( "CREATE TABLE IF NOT EXISTS " + dbName - + ".MigrationLoadTest (" + + "." + + TABLE + + " (" + "Id VARCHAR(36) NOT NULL," + "Payload LONGTEXT NOT NULL," + "PRIMARY KEY (Id)" @@ -416,11 +395,6 @@ private void createLogicalTableSchema(CloudSqlResourceManager manager, String db } private void createAndUploadShardConfigToGcs() throws IOException { - CloudSqlResourceManager manager1 = - (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard1"); - CloudSqlResourceManager manager2 = - (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard2"); - JsonArray ja = new JsonArray(); ja.add(createShardConfig("shard_0", "shard0", manager1)); ja.add(createShardConfig("shard_1", "shard1", manager1)); @@ -446,7 +420,7 @@ private JsonObject createShardConfig( return jsObj; } - private PipelineLauncher.LaunchInfo launchClassicImportJob(String inputDir) throws IOException { + private PipelineLauncher.LaunchInfo launchImportJob(String inputDir) throws IOException { ClassicTemplateClient classicClient = ClassicTemplateClient.builder(CREDENTIALS).build(); Map<String, String> params = new HashMap<>(); @@ -460,7 +434,7 @@ private PipelineLauncher.LaunchInfo launchClassicImportJob(String inputDir) thro "gs://dataflow-templates/latest/GCS_Avro_to_Cloud_Spanner") .setParameters(params) .addEnvironment("numWorkers", 80) - .addEnvironment("maxWorkers", 500) + .addEnvironment("maxWorkers", 120) .addEnvironment("machineType", "n2-highmem-8") .build(); @@ -503,10 +477,17 @@ private PipelineLauncher.LaunchInfo launchReverseReplicationJob( } private long getLogicalDatabaseRowCount(CloudSqlResourceManager manager, String dbName) { + // We use the InnoDB optimizer hint 'SET_VAR(innodb_parallel_read_threads=94)' to explicitly + // instruct MySQL to leverage parallel read threads for the COUNT(*) operation. On massive + // shards containing 250 million rows, this utilizes multiple CPU cores of the high-resource + // Cloud SQL instance, accelerating the full-table scan from over 10 minutes down to 2 minutes. String query = "SELECT /*+ SET_VAR(innodb_parallel_read_threads=94) */ COUNT(*) FROM " + dbName - + ".MigrationLoadTest"; + + "." + TABLE; List<Map<String, Object>> result = manager.runSQLQuery(query); if (result != null && !result.isEmpty()) { Map<String, Object> row = result.get(0);
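One useful property of this hint is that SET_VAR is scoped to the single hinted statement, so the elevated thread count never leaks into the session. A minimal standalone JDBC reproduction of the hinted count is sketched below; the endpoint, credentials, and shard name are placeholder assumptions, and mysql-connector-j is assumed to be on the classpath:

```java
// Standalone sketch of the hinted COUNT(*); host, user, and password are illustrative.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class HintedCountProbe {
  public static void main(String[] args) throws Exception {
    // SET_VAR scopes innodb_parallel_read_threads to this statement only; the session
    // default (4 in MySQL 8.0) is untouched for any other query on the same connection.
    String sql =
        "SELECT /*+ SET_VAR(innodb_parallel_read_threads=94) */ COUNT(*) "
            + "FROM shard0.MigrationLoadTest";
    try (Connection conn =
            DriverManager.getConnection("jdbc:mysql://10.0.0.5:3306", "root", "password");
        Statement stmt = conn.createStatement();
        ResultSet rs = stmt.executeQuery(sql)) {
      if (rs.next()) {
        System.out.println("Row count: " + rs.getLong(1));
      }
    }
  }
}
```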
@@ -520,36 +501,16 @@ private long getLogicalDatabaseRowCount(CloudSqlResourceManager manager, String public void updateSpannerNodeCount(String instanceId, int nodeCount) { - com.google.cloud.spanner.SpannerOptions options = - com.google.cloud.spanner.SpannerOptions.newBuilder().setProjectId(project).build(); - try (com.google.cloud.spanner.Spanner spanner = options.getService()) { - com.google.cloud.spanner.InstanceAdminClient instanceAdminClient = - spanner.getInstanceAdminClient(); - - int fromCount = -1; - if (spannerResourceManager != null - && instanceId.equals(spannerResourceManager.getInstanceId())) { - fromCount = originalSpannerNodeCount != null ? originalSpannerNodeCount : -1; - } else if (spannerMetadataResourceManager != null - && instanceId.equals(spannerMetadataResourceManager.getInstanceId())) { - fromCount = - originalSpannerMetadataNodeCount != null ? originalSpannerMetadataNodeCount : -1; - } + SpannerOptions options = SpannerOptions.newBuilder().setProjectId(project).build(); + try (Spanner spanner = options.getService()) { + InstanceAdminClient instanceAdminClient = spanner.getInstanceAdminClient(); - LOG.info( - "Updating Spanner instance {} node count from {} to {}...", - instanceId, - fromCount, - nodeCount); - com.google.cloud.spanner.InstanceInfo instanceInfo = - com.google.cloud.spanner.InstanceInfo.newBuilder( - com.google.cloud.spanner.InstanceId.of(project, instanceId)) + LOG.info("Updating Spanner instance {} node count to {}...", instanceId, nodeCount); + InstanceInfo instanceInfo = + InstanceInfo.newBuilder(InstanceId.of(project, instanceId)) .setNodeCount(nodeCount) .build(); - instanceAdminClient - .updateInstance( - instanceInfo, com.google.cloud.spanner.InstanceInfo.InstanceField.NODE_COUNT) - .get(); + instanceAdminClient.updateInstance(instanceInfo, InstanceField.NODE_COUNT).get(); LOG.info("Successfully updated Spanner instance {} node count to {}.", instanceId, nodeCount); } catch (Exception e) { LOG.error("Failed to update Spanner instance node count.", e); @@ -558,12 +519,10 @@ public void updateSpannerNodeCount(String instanceId, int nodeCount) { } public int getSpannerNodeCount(String instanceId) { - com.google.cloud.spanner.SpannerOptions options = - com.google.cloud.spanner.SpannerOptions.newBuilder().setProjectId(project).build(); - try (com.google.cloud.spanner.Spanner spanner = options.getService()) { - com.google.cloud.spanner.InstanceAdminClient instanceAdminClient = - spanner.getInstanceAdminClient(); - com.google.cloud.spanner.Instance instance = instanceAdminClient.getInstance(instanceId); + SpannerOptions options = SpannerOptions.newBuilder().setProjectId(project).build(); + try (Spanner spanner = options.getService()) { + InstanceAdminClient instanceAdminClient = spanner.getInstanceAdminClient(); + Instance instance = instanceAdminClient.getInstance(instanceId); return instance.getNodeCount(); } catch (Exception e) { LOG.error("Failed to retrieve Spanner instance node count.", e); diff --git
a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java deleted file mode 100644 index 2b669ed523..0000000000 --- a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogStepLT.java +++ /dev/null @@ -1,522 +0,0 @@ -/* - * Copyright (C) 2026 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package com.google.cloud.teleport.v2.templates; - -import static org.apache.beam.it.common.TestProperties.getProperty; -import static org.apache.beam.it.truthmatchers.PipelineAsserts.assertThatPipeline; -import static org.apache.beam.it.truthmatchers.PipelineAsserts.assertThatResult; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; - -import com.google.cloud.spanner.Mutation; -import com.google.cloud.spanner.Struct; -import com.google.cloud.teleport.metadata.TemplateLoadTest; -import com.google.cloud.teleport.v2.spanner.migrations.shard.Shard; -import com.google.gson.Gson; -import com.google.gson.JsonArray; -import com.google.gson.JsonObject; -import java.io.IOException; -import java.time.Duration; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import org.apache.beam.it.common.PipelineLauncher; -import org.apache.beam.it.common.PipelineOperator; -import org.apache.beam.it.common.TestProperties; -import org.apache.beam.it.gcp.cloudsql.CloudSqlResourceManager; -import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator; -import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator.DatabaseType; -import org.apache.beam.it.gcp.dataflow.ClassicTemplateClient; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; -import org.junit.experimental.categories.Category; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Progressive, fail-fast Consolidated End-to-End Load/Sanity test for {@link SpannerToSourceDb} - * template. - * - *
<p>
Objective: Validate Spanner, GCS config, CloudSQL connectivity (Step 1), classic Avro Import - * template upscaling and execution (Step 2), and SpannerToSourceDb reverse replication E2E sanity - * backlog migration (Step 3) back-to-back in under 15 minutes using a tiny dataset. - */ -@Category(TemplateLoadTest.class) -@TemplateLoadTest(SpannerToSourceDb.class) -@RunWith(JUnit4.class) -public class SpannerToSourceDbBacklogStepLT extends SpannerToSourceDbLTBase { - - private static final Logger LOG = LoggerFactory.getLogger(SpannerToSourceDbBacklogStepLT.class); - - private static final String TEMPLATE_SPEC_PATH = - com.google.common.base.MoreObjects.firstNonNull( - TestProperties.specPath(), "gs://dataflow-templates/latest/flex/Spanner_to_SourceDb"); - - private final String spannerDdlResource = "SpannerToSourceDbBacklogLT/spanner-schema.sql"; - private final String sessionFileResource = "SpannerToSourceDbBacklogLT/session.json"; - private final String table = "MigrationLoadTest"; - - private CloudSqlShardOrchestrator orchestrator; - private Integer originalSpannerNodeCount = null; - private Integer originalSpannerMetadataNodeCount = null; - - @Before - public void setup() throws IOException { - LOG.info("Initializing resource managers for Consolidated E2E Sanity Test via Orchestrator..."); - - // Setup Spanner database and metadata database, GCS artifact resource manager, and session - // files - setupResourceManagers(spannerDdlResource, sessionFileResource); - - // Initialize the Cloud SQL Shard Orchestrator for dynamic GCP-level provisioning over Private - // IP - orchestrator = - new CloudSqlShardOrchestrator( - DatabaseType.MYSQL, - CloudSqlShardOrchestrator.MYSQL_8_0, - project, - region, - gcsResourceManager); - - Map> shardMap = new HashMap<>(); - shardMap.put("nokill-high-resources-backlog-shard1", List.of("shard0", "shard1")); - shardMap.put("nokill-high-resources-backlog-shard2", List.of("shard2", "shard3")); - - // Initialize the physical instances (reusing existing ones) and logical schemas - orchestrator.initialize(shardMap, "orchestrator_shards_bulk.json"); - - // Create logical table schemas inside each database shard - LOG.info("Creating logical schemas on MySQL shards..."); - CloudSqlResourceManager manager1 = - (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard1"); - CloudSqlResourceManager manager2 = - (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard2"); - createLogicalTableSchema(manager1, "shard0"); - createLogicalTableSchema(manager1, "shard1"); - createLogicalTableSchema(manager2, "shard2"); - createLogicalTableSchema(manager2, "shard3"); - - // Upload sharding configuration in the flat format expected by SpannerToSourceDb - LOG.info("Generating and uploading flat sharding configuration to GCS..."); - createAndUploadShardConfigToGcs(); - - // Store original node counts for cleanup - originalSpannerNodeCount = getSpannerNodeCount(spannerResourceManager.getInstanceId()); - originalSpannerMetadataNodeCount = - getSpannerNodeCount(spannerMetadataResourceManager.getInstanceId()); - } - - @After - public void tearDown() { - LOG.info("Cleaning up resources..."); - - // Reset Spanner instance to its original node count if it was modified - if (originalSpannerNodeCount != null && spannerResourceManager != null) { - try { - updateSpannerNodeCount(spannerResourceManager.getInstanceId(), originalSpannerNodeCount); - } catch (Exception e) { - LOG.warn("Failed to reset Spanner node count during teardown: 
", e); - } - } - // Reset Spanner Metadata instance to its original node count if it was modified - if (originalSpannerMetadataNodeCount != null && spannerMetadataResourceManager != null) { - try { - updateSpannerNodeCount( - spannerMetadataResourceManager.getInstanceId(), originalSpannerMetadataNodeCount); - } catch (Exception e) { - LOG.warn("Failed to reset Spanner Metadata node count during teardown: ", e); - } - } - - cleanupResourceManagers(); - if (orchestrator != null) { - orchestrator.cleanup(); - } - } - - @Test - public void backlogReplicationSanityE2E() throws IOException, InterruptedException { - LOG.info("Running Consolidated Backlog Replication E2E Sanity Test..."); - - // ------------------------------------------------------------- - // PHASE 1: Connectivity & Setup Sanity - // ------------------------------------------------------------- - LOG.info("PHASE 1: Verifying Spanner & CloudSQL setup connectivity..."); - assertNotNull("Spanner resource manager should be initialized", spannerResourceManager); - - String testId = "test-id-12345"; - String testPayload = "test-payload-sanity"; - String testShardId = "shard_0"; - - // Write/Read Spanner Ping Row - List mutations = new ArrayList<>(); - mutations.add( - Mutation.newInsertOrUpdateBuilder("MigrationLoadTest") - .set("Id") - .to(testId) - .set("Payload") - .to(testPayload) - .set("migration_shard_id") - .to(testShardId) - .build()); - spannerResourceManager.write(mutations); - - List spannerResults = - spannerResourceManager.runQuery( - String.format( - "SELECT Payload FROM MigrationLoadTest WHERE migration_shard_id = '%s' AND Id = '%s'", - testShardId, testId)); - assertNotNull("Results from Spanner should not be null", spannerResults); - assertEquals("Should return exactly 1 row", 1, spannerResults.size()); - assertEquals( - "Payload matches what was written to Spanner", - testPayload, - spannerResults.get(0).getString("Payload")); - spannerResourceManager.write( - List.of( - Mutation.delete( - "MigrationLoadTest", com.google.cloud.spanner.Key.of(testShardId, testId)))); - - // Verify GCS configs exist - String sessionGcsPath = getGcsPath(SESSION_FILE_NAME, gcsResourceManager); - assertTrue("Session file should exist on GCS", sessionGcsPath.startsWith("gs://")); - String shardGcsPath = getGcsPath(SOURCE_SHARDS_FILE_NAME, gcsResourceManager); - assertTrue("Shard file should exist on GCS", shardGcsPath.startsWith("gs://")); - - // Verify CloudSQL connectivity - CloudSqlResourceManager manager1 = - (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard1"); - CloudSqlResourceManager manager2 = - (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard2"); - assertNotNull("Shard 1 resource manager should be initialized", manager1); - assertNotNull("Shard 2 resource manager should be initialized", manager2); - verifyMySqlLogicalShard(manager1, "shard0"); - verifyMySqlLogicalShard(manager1, "shard1"); - verifyMySqlLogicalShard(manager2, "shard2"); - verifyMySqlLogicalShard(manager2, "shard3"); - - // ------------------------------------------------------------- - // PHASE 2: Spanner Scale-Up & Avro Import - // ------------------------------------------------------------- - LOG.info("PHASE 2: Scaling Spanner & running Avro Import..."); - - // Record UTC start timestamp before import begins (to serve as change stream start timestamp) - String startTimestamp = java.time.Instant.now().toString(); - LOG.info("Recorded UTC start timestamp for change stream: {}", 
startTimestamp); - - int scaleNodes = - Integer.parseInt(getProperty("spannerScaleNodes", "25", TestProperties.Type.PROPERTY)); - updateSpannerNodeCount(spannerResourceManager.getInstanceId(), scaleNodes); - - // Verify scale-up - int currentNodeCount = getSpannerNodeCount(spannerResourceManager.getInstanceId()); - assertEquals( - "Spanner instance node count mismatch after scale-up", scaleNodes, currentNodeCount); - - // Run Avro Import with small dataset (100 rows) - String avroInputDir = - getProperty( - "avroInputDir", - "gs://nokill-spanner-to-sourcedb-load/small-data/avro/", - TestProperties.Type.PROPERTY); - long expectedSpannerCount = - Long.parseLong(getProperty("expectedSpannerCount", "100", TestProperties.Type.PROPERTY)); - int importTimeoutMinutes = - Integer.parseInt(getProperty("importTimeoutMinutes", "15", TestProperties.Type.PROPERTY)); - - // Ensure avroInputDir ends with a trailing slash for the classic import template - if (!avroInputDir.endsWith("/")) { - avroInputDir = avroInputDir + "/"; - } - - LOG.info("Avro Input Directory: {}", avroInputDir); - LOG.info("Expected Spanner count: {}", expectedSpannerCount); - - PipelineLauncher.LaunchInfo importJobInfo = launchClassicImportJob(avroInputDir); - assertThatPipeline(importJobInfo).isRunning(); - - PipelineOperator.Result importResult = - pipelineOperator.waitUntilDone( - createConfig(importJobInfo, Duration.ofMinutes(importTimeoutMinutes))); - assertThatResult(importResult).isLaunchFinished(); - - long spannerCount = spannerResourceManager.getRowCount(table); - assertEquals("Spanner database row count mismatch", expectedSpannerCount, spannerCount); - LOG.info("Import Phase successful! Imported {} rows.", spannerCount); - - // ------------------------------------------------------------- - // PHASE 3: Downscale & Reverse Replication E2E Verification - // ------------------------------------------------------------- - // Downscale main Spanner instance to 5 nodes and upscale metadata Spanner instance to 20 nodes - LOG.info( - "Downscaling main Spanner instance to 5 nodes and upscaling metadata instance to 20 nodes before starting replication..."); - updateSpannerNodeCount(spannerResourceManager.getInstanceId(), 5); - updateSpannerNodeCount(spannerMetadataResourceManager.getInstanceId(), 20); - - int reverseTimeoutMinutes = - Integer.parseInt(getProperty("reverseTimeoutMinutes", "10", TestProperties.Type.PROPERTY)); - int maxShardConnections = - Integer.parseInt(getProperty("maxShardConnections", "2000", TestProperties.Type.PROPERTY)); - - PipelineLauncher.LaunchInfo reverseJobInfo = - launchReverseReplicationJob(startTimestamp, 200, 200, "n2-highmem-8", maxShardConnections); - assertThatPipeline(reverseJobInfo).isRunning(); - - // Poll success_record_count metric until it reaches the expected count (100) - long polledCount = 0; - long startTimeMillis = System.currentTimeMillis(); - while (polledCount < expectedSpannerCount) { - if (System.currentTimeMillis() - startTimeMillis > reverseTimeoutMinutes * 60 * 1000) { - throw new RuntimeException( - "Reverse replication sanity check timed out after " - + reverseTimeoutMinutes - + " minutes."); - } - Thread.sleep(30000); // Poll every 30 seconds - Double metricVal = - pipelineLauncher.getMetric( - project, region, reverseJobInfo.jobId(), "success_record_count"); - polledCount = metricVal != null ? metricVal.longValue() : 0; - LOG.info("Polled success_record_count: {}. 
Target: {}", polledCount, expectedSpannerCount); - } - - // Verify database parity on MySQL shards - LOG.info( - "Replication threshold reached. Verifying logical databases row counts on CloudSQL..."); - long count0 = getLogicalDatabaseRowCount(manager1, "shard0"); - long count1 = getLogicalDatabaseRowCount(manager1, "shard1"); - long count2 = getLogicalDatabaseRowCount(manager2, "shard2"); - long count3 = getLogicalDatabaseRowCount(manager2, "shard3"); - - LOG.info( - "Logical databases replicated row counts: shard0={}, shard1={}, shard2={}, shard3={}", - count0, - count1, - count2, - count3); - assertEquals("shard0 row count mismatch", 0L, count0); - assertEquals("shard1 row count mismatch", expectedSpannerCount, count1); - assertEquals("shard2 row count mismatch", 0L, count2); - assertEquals("shard3 row count mismatch", 0L, count3); - - LOG.info("All systems and replication components verified E2E! Cancelling job..."); - PipelineOperator.Result cancelResult = - pipelineOperator.cancelJobAndFinish(createConfig(reverseJobInfo, Duration.ofMinutes(5))); - assertThatResult(cancelResult).isLaunchFinished(); - - LOG.info("Consolidated Backlog Replication E2E Sanity Test passed successfully!"); - } - - private void verifyMySqlLogicalShard(CloudSqlResourceManager manager, String dbName) { - LOG.info("Verifying logical database: {}...", dbName); - - String testId = "test-id-" + dbName; - String testPayload = "payload-" + dbName; - - // Insert test row - String insertSql = - String.format( - "INSERT INTO %s.MigrationLoadTest (Id, Payload) VALUES ('%s', '%s')", - dbName, testId, testPayload); - manager.runSQLUpdate(insertSql); - - // Query test row back - String selectSql = - String.format("SELECT Payload FROM %s.MigrationLoadTest WHERE Id = '%s'", dbName, testId); - List> result = manager.runSQLQuery(selectSql); - - assertNotNull("Result from MySQL logical shard " + dbName + " should not be null", result); - assertEquals("Should return exactly 1 row", 1, result.size()); - assertEquals( - "Payload matches what was written to " + dbName, testPayload, result.get(0).get("Payload")); - - // Cleanup test row - String deleteSql = - String.format("DELETE FROM %s.MigrationLoadTest WHERE Id = '%s'", dbName, testId); - manager.runSQLUpdate(deleteSql); - LOG.info("Logical database {} verified successfully.", dbName); - } - - private void createLogicalTableSchema(CloudSqlResourceManager manager, String dbName) { - manager.runSQLUpdate( - "CREATE TABLE IF NOT EXISTS " - + dbName - + ".MigrationLoadTest (" - + "Id VARCHAR(36) NOT NULL," - + "Payload LONGTEXT NOT NULL," - + "PRIMARY KEY (Id)" - + ") ENGINE=InnoDB"); - } - - private void createAndUploadShardConfigToGcs() throws IOException { - CloudSqlResourceManager manager1 = - (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard1"); - CloudSqlResourceManager manager2 = - (CloudSqlResourceManager) orchestrator.managers.get("nokill-high-resources-backlog-shard2"); - - JsonArray ja = new JsonArray(); - ja.add(createShardConfig("shard_0", "shard0", manager1)); - ja.add(createShardConfig("shard_1", "shard1", manager1)); - ja.add(createShardConfig("shard_2", "shard2", manager2)); - ja.add(createShardConfig("shard_3", "shard3", manager2)); - - String shardFileContents = ja.toString(); - LOG.info("Shard file contents: {}", shardFileContents); - gcsResourceManager.createArtifact(SOURCE_SHARDS_FILE_NAME, shardFileContents); - } - - private JsonObject createShardConfig( - String logicalShardId, String dbName, CloudSqlResourceManager 
manager) { - Shard shard = new Shard(); - shard.setLogicalShardId(logicalShardId); - shard.setUser(manager.getUsername()); - shard.setHost(manager.getHost()); - shard.setPassword(manager.getPassword()); - shard.setPort(String.valueOf(manager.getPort())); - shard.setDbName(dbName); - JsonObject jsObj = (JsonObject) new Gson().toJsonTree(shard).getAsJsonObject(); - jsObj.remove("secretManagerUri"); - return jsObj; - } - - private PipelineLauncher.LaunchInfo launchClassicImportJob(String inputDir) throws IOException { - ClassicTemplateClient classicClient = ClassicTemplateClient.builder(CREDENTIALS).build(); - - Map params = new HashMap<>(); - params.put("instanceId", spannerResourceManager.getInstanceId()); - params.put("databaseId", spannerResourceManager.getDatabaseId()); - params.put("inputDir", inputDir); - - PipelineLauncher.LaunchConfig options = - PipelineLauncher.LaunchConfig.builder( - "spanner-avro-import-sanity", - "gs://dataflow-templates/latest/GCS_Avro_to_Cloud_Spanner") - .setParameters(params) - .addEnvironment("numWorkers", 80) - .addEnvironment("maxWorkers", 120) - .addEnvironment("machineType", "n2-standard-8") - .build(); - - return classicClient.launch(project, region, options); - } - - private PipelineLauncher.LaunchInfo launchReverseReplicationJob( - String startTimestamp, - int numWorkers, - int maxWorkers, - String machineType, - int maxShardConnections) - throws IOException { - - Map params = new HashMap<>(); - params.put("changeStreamName", "MigrationStream"); - params.put("instanceId", spannerResourceManager.getInstanceId()); - params.put("databaseId", spannerResourceManager.getDatabaseId()); - params.put("spannerProjectId", project); - params.put("metadataInstance", spannerMetadataResourceManager.getInstanceId()); - params.put("metadataDatabase", spannerMetadataResourceManager.getDatabaseId()); - params.put("sourceShardsFilePath", getGcsPath(SOURCE_SHARDS_FILE_NAME, gcsResourceManager)); - params.put("deadLetterQueueDirectory", getGcsPath("dlq", gcsResourceManager)); - params.put("startTimestamp", startTimestamp); - params.put("maxShardConnections", String.valueOf(maxShardConnections)); - params.put("sessionFilePath", getGcsPath(SESSION_FILE_NAME, gcsResourceManager)); - params.put("workerMachineType", machineType); - - PipelineLauncher.LaunchConfig.Builder options = - PipelineLauncher.LaunchConfig.builder(getClass().getSimpleName(), TEMPLATE_SPEC_PATH); - options - .addEnvironment("maxWorkers", maxWorkers) - .addEnvironment("numWorkers", numWorkers) - .addEnvironment("machineType", machineType) - .addEnvironment( - "additionalExperiments", java.util.Collections.singletonList("use_runner_v2")); - - options.setParameters(params); - return pipelineLauncher.launch(project, region, options.build()); - } - - private long getLogicalDatabaseRowCount(CloudSqlResourceManager manager, String dbName) { - String query = "SELECT COUNT(*) FROM " + dbName + ".MigrationLoadTest"; - List> result = - manager.runSQLQuery(query); // Using runSQLQuery to execute simple counting query - if (result != null && !result.isEmpty()) { - Map row = result.get(0); - for (Object val : row.values()) { - if (val instanceof Number) { - return ((Number) val).longValue(); - } - } - } - return 0; - } - - public void updateSpannerNodeCount(String instanceId, int nodeCount) { - com.google.cloud.spanner.SpannerOptions options = - com.google.cloud.spanner.SpannerOptions.newBuilder().setProjectId(project).build(); - try (com.google.cloud.spanner.Spanner spanner = options.getService()) { - 
com.google.cloud.spanner.InstanceAdminClient instanceAdminClient = - spanner.getInstanceAdminClient(); - - int fromCount = -1; - if (spannerResourceManager != null - && instanceId.equals(spannerResourceManager.getInstanceId())) { - fromCount = originalSpannerNodeCount != null ? originalSpannerNodeCount : -1; - } else if (spannerMetadataResourceManager != null - && instanceId.equals(spannerMetadataResourceManager.getInstanceId())) { - fromCount = - originalSpannerMetadataNodeCount != null ? originalSpannerMetadataNodeCount : -1; - } - - LOG.info( - "Updating Spanner instance {} node count from {} to {}...", - instanceId, - fromCount, - nodeCount); - com.google.cloud.spanner.InstanceInfo instanceInfo = - com.google.cloud.spanner.InstanceInfo.newBuilder( - com.google.cloud.spanner.InstanceId.of(project, instanceId)) - .setNodeCount(nodeCount) - .build(); - instanceAdminClient - .updateInstance( - instanceInfo, com.google.cloud.spanner.InstanceInfo.InstanceField.NODE_COUNT) - .get(); - LOG.info("Successfully updated Spanner instance {} node count to {}.", instanceId, nodeCount); - } catch (Exception e) { - LOG.error("Failed to update Spanner instance node count.", e); - throw new RuntimeException("Failed to update Spanner node count", e); - } - } - - public int getSpannerNodeCount(String instanceId) { - com.google.cloud.spanner.SpannerOptions options = - com.google.cloud.spanner.SpannerOptions.newBuilder().setProjectId(project).build(); - try (com.google.cloud.spanner.Spanner spanner = options.getService()) { - com.google.cloud.spanner.InstanceAdminClient instanceAdminClient = - spanner.getInstanceAdminClient(); - com.google.cloud.spanner.Instance instance = instanceAdminClient.getInstance(instanceId); - return instance.getNodeCount(); - } catch (Exception e) { - LOG.error("Failed to retrieve Spanner instance node count.", e); - throw new RuntimeException("Failed to get Spanner node count", e); - } - } -} From d04aff31992279fa810488d578e8657367c4bbb0 Mon Sep 17 00:00:00 2001 From: aasthabharill Date: Wed, 13 May 2026 11:43:04 +0530 Subject: [PATCH 6/6] review changes --- .../cloudsql/CloudSqlShardOrchestrator.java | 4 +- .../v2/templates/SpannerToSourceDbLTBase.java | 57 ++++- ...a => SpannerToSourceDbLargeBacklogLT.java} | 208 +++++++++++++----- .../session.json | 0 .../spanner-schema.sql | 0 5 files changed, 204 insertions(+), 65 deletions(-) rename v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/{SpannerToSourceDbBacklogLT.java => SpannerToSourceDbLargeBacklogLT.java} (74%) rename v2/spanner-to-sourcedb/src/test/resources/{SpannerToSourceDbBacklogLT => SpannerToSourceDbLargeBacklogLT}/session.json (100%) rename v2/spanner-to-sourcedb/src/test/resources/{SpannerToSourceDbBacklogLT => SpannerToSourceDbLargeBacklogLT}/spanner-schema.sql (100%) diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql/CloudSqlShardOrchestrator.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql/CloudSqlShardOrchestrator.java index 30b82d7f29..6e1704f104 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql/CloudSqlShardOrchestrator.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/cloudsql/CloudSqlShardOrchestrator.java @@ -322,7 +322,9 @@ protected String ensureInstanceAndGetIp(String instanceName) protected void createPhysicalInstance(String instanceName) throws IOException, InterruptedException { - String tier = databaseType == DatabaseType.MYSQL ? 
"db-n1-standard-2" : "db-custom-2-7680"; + String defaultTier = + databaseType == DatabaseType.MYSQL ? "db-n1-standard-2" : "db-custom-2-7680"; + String tier = System.getProperty("cloudSqlInstanceTier", defaultTier); DatabaseInstance instance = new DatabaseInstance() .setName(instanceName) diff --git a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbLTBase.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbLTBase.java index 7a2c0c6946..3c30c74348 100644 --- a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbLTBase.java +++ b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbLTBase.java @@ -32,6 +32,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Random; import org.apache.beam.it.common.PipelineLauncher; import org.apache.beam.it.common.PipelineLauncher.LaunchConfig; import org.apache.beam.it.common.PipelineLauncher.LaunchInfo; @@ -39,6 +40,7 @@ import org.apache.beam.it.common.utils.IORedirectUtil; import org.apache.beam.it.common.utils.ResourceManagerUtils; import org.apache.beam.it.gcp.TemplateLoadTestBase; +import org.apache.beam.it.gcp.TestConstants; import org.apache.beam.it.gcp.artifacts.utils.ArtifactUtils; import org.apache.beam.it.gcp.pubsub.PubsubResourceManager; import org.apache.beam.it.gcp.spanner.SpannerResourceManager; @@ -152,10 +154,57 @@ public SpannerResourceManager createSpannerDatabase(String spannerDdlResourceFil } public SpannerResourceManager createSpannerMetadataDatabase() throws IOException { - SpannerResourceManager spannerMetadataResourceManager = - SpannerResourceManager.builder("rr-meta-" + testName, project, region) - .maybeUseStaticInstance() - .build(); + String metadataInstanceId = System.getProperty("spannerMetadataInstanceId"); + SpannerResourceManager.Builder builder = + SpannerResourceManager.builder("rr-meta-" + testName, project, region); + + if (metadataInstanceId != null && !metadataInstanceId.isEmpty()) { + builder.setInstanceId(metadataInstanceId).useStaticInstance(); + } else { + builder.maybeUseStaticInstance(); + } + + SpannerResourceManager spannerMetadataResourceManager = builder.build(); + + // Collision Detection and Auto-Avoidance + if (spannerResourceManager != null + && spannerMetadataResourceManager + .getInstanceId() + .equals(spannerResourceManager.getInstanceId())) { + + String spannerInstanceId = + System.getProperty("spannerInstanceId"); // check if it was a user defined instance + boolean isTestProject = + java.util.Objects.equals(project, "cloud-teleport-testing") + || java.util.Objects.equals(project, "span-cloud-teleport-testing"); + boolean shouldPickRandomInstance = + com.google.common.base.Strings.isNullOrEmpty(spannerInstanceId) + || java.util.Objects.equals(spannerInstanceId, "teleport"); + + if (isTestProject && shouldPickRandomInstance) { + List staticInstanceList = new ArrayList<>(TestConstants.SPANNER_TEST_INSTANCES); + // Avoid picking the same instance + staticInstanceList.remove(spannerResourceManager.getInstanceId()); + if (!staticInstanceList.isEmpty()) { + String newMetadataInstanceId = + staticInstanceList.get(new Random().nextInt(staticInstanceList.size())); + LOG.info( + "Spanner collision detected. 
Re-selecting metadata instance to: {}", + newMetadataInstanceId); + spannerMetadataResourceManager = + SpannerResourceManager.builder("rr-meta-" + testName, project, region) + .setInstanceId(newMetadataInstanceId) + .useStaticInstance() + .build(); + } + } else { + LOG.warn( + "WARNING: Both primary and metadata Spanner resource managers are configured to use the same instance: {}. " + + "To isolate resources, consider specifying '-DspannerInstanceId' and '-DspannerMetadataInstanceId' separately.", + spannerResourceManager.getInstanceId()); + } + } + String dummy = "CREATE TABLE IF NOT EXISTS t1(id INT64 ) primary key(id)"; spannerMetadataResourceManager.executeDdlStatement(dummy); return spannerMetadataResourceManager; diff --git a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogLT.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbLargeBacklogLT.java similarity index 74% rename from v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogLT.java rename to v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbLargeBacklogLT.java index 93792c31c0..163b337629 100644 --- a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbBacklogLT.java +++ b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbLargeBacklogLT.java @@ -46,6 +46,7 @@ import org.apache.beam.it.common.PipelineLauncher; import org.apache.beam.it.common.PipelineOperator; import org.apache.beam.it.common.TestProperties; +import org.apache.beam.it.conditions.ConditionCheck; import org.apache.beam.it.gcp.cloudsql.CloudSqlResourceManager; import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator; import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator.DatabaseType; @@ -68,19 +69,38 @@ @Category(TemplateLoadTest.class) @TemplateLoadTest(SpannerToSourceDb.class) @RunWith(JUnit4.class) -public class SpannerToSourceDbBacklogLT extends SpannerToSourceDbLTBase { +public class SpannerToSourceDbLargeBacklogLT extends SpannerToSourceDbLTBase { - private static final Logger LOG = LoggerFactory.getLogger(SpannerToSourceDbBacklogLT.class); + private static final Logger LOG = LoggerFactory.getLogger(SpannerToSourceDbLargeBacklogLT.class); private static final String TEMPLATE_SPEC_PATH = MoreObjects.firstNonNull( TestProperties.specPath(), "gs://dataflow-templates/latest/flex/Spanner_to_SourceDb"); private static final String SPANNER_DDL_RESOURCE = - "SpannerToSourceDbBacklogLT/spanner-schema.sql"; - private static final String SESSION_FILE_RESOURCE = "SpannerToSourceDbBacklogLT/session.json"; + "SpannerToSourceDbLargeBacklogLT/spanner-schema.sql"; + private static final String SESSION_FILE_RESOURCE = + "SpannerToSourceDbLargeBacklogLT/session.json"; private static final String TABLE = "MigrationLoadTest"; + private static final String DEFAULT_PHYSICAL_SHARD_1 = "nokill-high-resources-backlog-shard1"; + private static final String DEFAULT_PHYSICAL_SHARD_2 = "nokill-high-resources-backlog-shard2"; + private static final String DEFAULT_SPANNER_SCALE_NODES = "25"; + private static final String DEFAULT_AVRO_INPUT_DIR = + "gs://nokill-spanner-to-sourcedb-load/data/avro/"; + private static final String DEFAULT_EXPECTED_SPANNER_COUNT = "1000000000"; + private static final String DEFAULT_IMPORT_TIMEOUT_MINUTES = "120"; + private static final String 
DEFAULT_SPANNER_DOWNSCALE_NODES = "5"; + private static final String DEFAULT_METADATA_SCALE_NODES = "20"; + private static final String DEFAULT_REVERSE_TIMEOUT_MINUTES = "600"; + private static final String DEFAULT_MAX_SHARD_CONNECTIONS = "2000"; + private static final String DEFAULT_NUM_WORKERS = "200"; + private static final String DEFAULT_MAX_WORKERS = "200"; + private static final String DEFAULT_MACHINE_TYPE = "n2-highmem-8"; + private static final String DEFAULT_EXPECTED_SHARD_COUNT = "250000000"; + private static final String DEFAULT_METRIC_THRESHOLD = "1000000000"; + private static final String DEFAULT_VERIFICATION_TIMEOUT_MINUTES = "30"; + private CloudSqlShardOrchestrator orchestrator; private CloudSqlResourceManager manager1; private CloudSqlResourceManager manager2; @@ -101,11 +121,9 @@ public void setup() throws IOException { // The CloudSQL setup consists of 2 physical shards with 2 logical shards each String physicalShard1 = - getProperty( - "physicalShard1", "nokill-high-resources-backlog-shard1", TestProperties.Type.PROPERTY); + getProperty("physicalShard1", DEFAULT_PHYSICAL_SHARD_1, TestProperties.Type.PROPERTY); String physicalShard2 = - getProperty( - "physicalShard2", "nokill-high-resources-backlog-shard2", TestProperties.Type.PROPERTY); + getProperty("physicalShard2", DEFAULT_PHYSICAL_SHARD_2, TestProperties.Type.PROPERTY); Map> shardMap = new HashMap<>(); shardMap.put(physicalShard1, List.of("shard0", "shard1")); @@ -175,7 +193,9 @@ public void reverseReplicationBacklogLoadTest() // Node count taken from manual test results available in go/reverse-backlog-manual-tests int scaleNodes = - Integer.parseInt(getProperty("spannerScaleNodes", "25", TestProperties.Type.PROPERTY)); + Integer.parseInt( + getProperty( + "spannerScaleNodes", DEFAULT_SPANNER_SCALE_NODES, TestProperties.Type.PROPERTY)); updateSpannerNodeCount(spannerResourceManager.getInstanceId(), scaleNodes); // Verify scale-up - it is critical that the Spanner instance is scaled up, otherwise the @@ -186,15 +206,19 @@ public void reverseReplicationBacklogLoadTest() // Run Avro Import with complete dataset (1 billion rows) String avroInputDir = - getProperty( - "avroInputDir", - "gs://nokill-spanner-to-sourcedb-load/data/avro/", - TestProperties.Type.PROPERTY); + getProperty("avroInputDir", DEFAULT_AVRO_INPUT_DIR, TestProperties.Type.PROPERTY); long expectedSpannerCount = Long.parseLong( - getProperty("expectedSpannerCount", "1000000000", TestProperties.Type.PROPERTY)); + getProperty( + "expectedSpannerCount", + DEFAULT_EXPECTED_SPANNER_COUNT, + TestProperties.Type.PROPERTY)); int importTimeoutMinutes = - Integer.parseInt(getProperty("importTimeoutMinutes", "120", TestProperties.Type.PROPERTY)); + Integer.parseInt( + getProperty( + "importTimeoutMinutes", + DEFAULT_IMPORT_TIMEOUT_MINUTES, + TestProperties.Type.PROPERTY)); // Ensure avroInputDir ends with a trailing slash for the classic import template if (!avroInputDir.endsWith("/")) { @@ -222,9 +246,15 @@ public void reverseReplicationBacklogLoadTest() // Downscale main Spanner instance to 5 nodes and upscale metadata Spanner instance to 20 nodes // (go/reverse-backlog-manual-tests) int spannerDownscaleNodes = - Integer.parseInt(getProperty("spannerDownscaleNodes", "5", TestProperties.Type.PROPERTY)); + Integer.parseInt( + getProperty( + "spannerDownscaleNodes", + DEFAULT_SPANNER_DOWNSCALE_NODES, + TestProperties.Type.PROPERTY)); int metadataScaleNodes = - Integer.parseInt(getProperty("metadataScaleNodes", "20", TestProperties.Type.PROPERTY)); + 
Integer.parseInt( + getProperty( + "metadataScaleNodes", DEFAULT_METADATA_SCALE_NODES, TestProperties.Type.PROPERTY)); LOG.info( "Downscaling main Spanner instance to {} nodes and upscaling metadata instance to {} nodes before starting replication...", @@ -234,14 +264,25 @@ public void reverseReplicationBacklogLoadTest() updateSpannerNodeCount(spannerMetadataResourceManager.getInstanceId(), metadataScaleNodes); int reverseTimeoutMinutes = - Integer.parseInt(getProperty("reverseTimeoutMinutes", "600", TestProperties.Type.PROPERTY)); + Integer.parseInt( + getProperty( + "reverseTimeoutMinutes", + DEFAULT_REVERSE_TIMEOUT_MINUTES, + TestProperties.Type.PROPERTY)); int maxShardConnections = - Integer.parseInt(getProperty("maxShardConnections", "2000", TestProperties.Type.PROPERTY)); + Integer.parseInt( + getProperty( + "maxShardConnections", + DEFAULT_MAX_SHARD_CONNECTIONS, + TestProperties.Type.PROPERTY)); int numWorkers = - Integer.parseInt(getProperty("numWorkers", "200", TestProperties.Type.PROPERTY)); + Integer.parseInt( + getProperty("numWorkers", DEFAULT_NUM_WORKERS, TestProperties.Type.PROPERTY)); int maxWorkers = - Integer.parseInt(getProperty("maxWorkers", "200", TestProperties.Type.PROPERTY)); - String machineType = getProperty("machineType", "n2-highmem-8", TestProperties.Type.PROPERTY); + Integer.parseInt( + getProperty("maxWorkers", DEFAULT_MAX_WORKERS, TestProperties.Type.PROPERTY)); + String machineType = + getProperty("machineType", DEFAULT_MACHINE_TYPE, TestProperties.Type.PROPERTY); PipelineLauncher.LaunchInfo reverseJobInfo = launchReverseReplicationJob( @@ -259,44 +300,63 @@ public void reverseReplicationBacklogLoadTest() long expectedShardCount = Long.parseLong( - getProperty("expectedShardCount", "250000000", TestProperties.Type.PROPERTY)); + getProperty( + "expectedShardCount", DEFAULT_EXPECTED_SHARD_COUNT, TestProperties.Type.PROPERTY)); long metricThreshold = - Long.parseLong(getProperty("metricThreshold", "1000000000", TestProperties.Type.PROPERTY)); + Long.parseLong( + getProperty("metricThreshold", DEFAULT_METRIC_THRESHOLD, TestProperties.Type.PROPERTY)); - long polledCount = 0; long startTimeMillis = System.currentTimeMillis(); int numShards = 4; - while (polledCount < metricThreshold) { - if (System.currentTimeMillis() - startTimeMillis > reverseTimeoutMinutes * 60 * 1000) { - throw new RuntimeException( - "Reverse replication load check timed out after " - + reverseTimeoutMinutes - + " minutes."); - } - - Double successRecordsCount = - pipelineLauncher.getMetric( - project, region, reverseJobInfo.jobId(), "success_record_count"); - polledCount = successRecordsCount != null ? successRecordsCount.longValue() : 0; - - LOG.info("--- PIPELINE PROGRESS UPDATE ---"); - LOG.info( - "Time Elapsed: {} minutes / {} minutes", - (System.currentTimeMillis() - startTimeMillis) / 60000, - reverseTimeoutMinutes); - LOG.info( - "Polled success_record_count: {}. Target threshold: {}", polledCount, metricThreshold); - LOG.info("---------------------------------"); - - if (polledCount >= metricThreshold) { - break; - } - - Thread.sleep( - 900000); // Poll every 15 minutes. Since the test runs for 7-8 hours, 15-minute intervals - // print exactly 4 logs per hour, preventing clutter and API call costs. 
- } + ConditionCheck successRecordsCheck = + new ConditionCheck() { + @Override + protected String getDescription() { + return String.format( + "Check if Dataflow metric success_record_count reaches %d", metricThreshold); + } + + @Override + protected CheckResult check() { + try { + Double successRecordsCount = + pipelineLauncher.getMetric( + project, region, reverseJobInfo.jobId(), "success_record_count"); + long polledCount = successRecordsCount != null ? successRecordsCount.longValue() : 0; + + LOG.info("--- PIPELINE PROGRESS UPDATE ---"); + LOG.info( + "Time Elapsed: {} minutes / {} minutes", + (System.currentTimeMillis() - startTimeMillis) / 60000, + reverseTimeoutMinutes); + LOG.info( + "Polled success_record_count: {} / Target: {}", polledCount, metricThreshold); + LOG.info("---------------------------------"); + + if (polledCount >= metricThreshold) { + return new CheckResult(true, String.format("Threshold reached: %d", polledCount)); + } + return new CheckResult( + false, String.format("Current progress: %d rows", polledCount)); + } catch (Exception e) { + return new CheckResult(false, "Failed to retrieve job metrics: " + e.getMessage()); + } + } + }; + + PipelineOperator.Result result = + pipelineOperator.waitForCondition( + createConfig( + reverseJobInfo, + Duration.ofMinutes(reverseTimeoutMinutes), // total timeout + Duration.ofMinutes( + 15)), // Poll every 15 minutes. Since the test runs for 7-8 hours, 15-minute + // intervals print exactly 4 logs per hour, preventing clutter and API call costs. + successRecordsCheck); + + assertThatResult(result).meetsConditions(); // Verify database parity on MySQL shards with a retry loop to handle minor replication // synchronization lag @@ -305,7 +365,10 @@ public void reverseReplicationBacklogLoadTest() long verificationStartTime = System.currentTimeMillis(); int verificationTimeoutMinutes = Integer.parseInt( - getProperty("verificationTimeoutMinutes", "30", TestProperties.Type.PROPERTY)); + getProperty( + "verificationTimeoutMinutes", + DEFAULT_VERIFICATION_TIMEOUT_MINUTES, + TestProperties.Type.PROPERTY)); long verificationTimeoutMs = verificationTimeoutMinutes * 60 * 1000; // Configurable timeout (in minutes) for the final parity catch-up @@ -505,15 +568,40 @@ public void updateSpannerNodeCount(String instanceId, int nodeCount) { try (Spanner spanner = options.getService()) { InstanceAdminClient instanceAdminClient = spanner.getInstanceAdminClient(); - LOG.info("Updating Spanner instance {} node count to {}...", instanceId, nodeCount); InstanceInfo instanceInfo = InstanceInfo.newBuilder(InstanceId.of(project, instanceId)) .setNodeCount(nodeCount) .build(); - instanceAdminClient.updateInstance(instanceInfo, InstanceField.NODE_COUNT).get(); - LOG.info("Successfully updated Spanner instance {} node count to {}.", instanceId, nodeCount); + + int maxRetries = 3; + long backoffMs = 10000; // 10 seconds initial backoff + for (int attempt = 1; attempt <= maxRetries; attempt++) { + try { + LOG.info( + "Updating Spanner instance {} node count to {}... (Attempt {}/{})", + instanceId, + nodeCount, + attempt, + maxRetries); + instanceAdminClient.updateInstance(instanceInfo, InstanceField.NODE_COUNT).get(); + LOG.info( + "Successfully updated Spanner instance {} node count to {}.", instanceId, nodeCount); + return; + } catch (Exception e) { + if (attempt == maxRetries) { + throw e; + } + LOG.warn( + "Failed to update Spanner instance node count on attempt {}. Retrying in {} ms...", + attempt, + backoffMs, + e); + Thread.sleep(backoffMs); + backoffMs *= 2; // Exponential backoff + } + } } catch (Exception e) { - LOG.error("Failed to update Spanner instance node count.", e); + LOG.error("Failed to update Spanner instance node count after retries.", e); throw new RuntimeException("Failed to update Spanner node count", e); } }
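The inline retry added above could equally be factored into a small reusable helper. The following sketch captures the same attempt/backoff structure under the same defaults (3 attempts, 10 s initial backoff, doubling); RetryingAdminOp and runWithBackoff are illustrative names, not part of this patch:

```java
// Illustrative extraction of the retry-with-exponential-backoff pattern used above.
import java.util.concurrent.Callable;

final class RetryingAdminOp {

  /** Runs {@code op}, retrying up to {@code maxRetries} times with doubling backoff. */
  static <T> T runWithBackoff(Callable<T> op, int maxRetries, long initialBackoffMs)
      throws Exception {
    long backoffMs = initialBackoffMs;
    for (int attempt = 1; ; attempt++) {
      try {
        return op.call();
      } catch (Exception e) {
        if (attempt == maxRetries) {
          throw e; // Attempts exhausted: surface the last failure to the caller.
        }
        Thread.sleep(backoffMs);
        backoffMs *= 2; // Exponential backoff between attempts.
      }
    }
  }
}
```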
diff --git a/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbBacklogLT/session.json b/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbLargeBacklogLT/session.json similarity index 100% rename from v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbBacklogLT/session.json rename to v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbLargeBacklogLT/session.json diff --git a/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbBacklogLT/spanner-schema.sql b/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbLargeBacklogLT/spanner-schema.sql similarity index 100% rename from v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbBacklogLT/spanner-schema.sql rename to v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbLargeBacklogLT/spanner-schema.sql
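As a closing usage note for the configurable tier introduced in the CloudSqlShardOrchestrator hunk: a test's setup could drive it as sketched below. The cloudSqlInstanceTier property name, the orchestrator constructor, and the initialize call match this patch series; the tier value db-n1-highmem-16 and the wrapper class are illustrative assumptions.

```java
// Hypothetical setup snippet showing the new tier override; the property name matches the
// patch, while the chosen tier value and this class are only examples.
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator;
import org.apache.beam.it.gcp.cloudsql.CloudSqlShardOrchestrator.DatabaseType;
import org.apache.beam.it.gcp.storage.GcsResourceManager;

public class OrchestratorTierExample {

  public static void configure(
      String project, String region, GcsResourceManager gcsResourceManager) throws Exception {
    // Override the default tier (db-n1-standard-2 for MySQL) before any instance is created.
    System.setProperty("cloudSqlInstanceTier", "db-n1-highmem-16");

    CloudSqlShardOrchestrator orchestrator =
        new CloudSqlShardOrchestrator(
            DatabaseType.MYSQL,
            CloudSqlShardOrchestrator.MYSQL_8_0,
            project,
            region,
            gcsResourceManager);

    // Two physical instances hosting two logical shards each, mirroring the load test.
    Map<String, List<String>> shardMap = new HashMap<>();
    shardMap.put("nokill-high-resources-backlog-shard1", List.of("shard0", "shard1"));
    shardMap.put("nokill-high-resources-backlog-shard2", List.of("shard2", "shard3"));
    orchestrator.initialize(shardMap, "orchestrator_shards_bulk.json");
  }
}
```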