From 2a7da21f32129921961b8927063667e2b3df564c Mon Sep 17 00:00:00 2001 From: chulanovskyi Date: Wed, 25 Mar 2026 14:32:16 +0200 Subject: [PATCH 1/4] feat: replaced `print` with `dbutils.notebook` handler --- reverse_engineering/helpers/pythonScriptGeneratorHelper.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reverse_engineering/helpers/pythonScriptGeneratorHelper.js b/reverse_engineering/helpers/pythonScriptGeneratorHelper.js index aa582da..36696d7 100644 --- a/reverse_engineering/helpers/pythonScriptGeneratorHelper.js +++ b/reverse_engineering/helpers/pythonScriptGeneratorHelper.js @@ -27,14 +27,14 @@ def getDatabaseMetadata(dbName): clusterData = { dbName: getDatabaseMetadata(dbName) for dbName in databasesNames } -print(json.dumps(clusterData)) +dbutils.notebook.exit(json.dumps(clusterData)) `; const getViewNamesCommand = databaseName => ` import json viewNames = spark.sql("show views in ${databaseName}").rdd.map(lambda p: p.viewName).collect() -print(json.dumps(viewNames)) +dbutils.notebook.exit(json.dumps(viewNames)) `; module.exports = { From f266cc9328b3ce97ba429b45bc5393fefa9c3088 Mon Sep 17 00:00:00 2001 From: chulanovskyi Date: Thu, 26 Mar 2026 10:26:47 +0200 Subject: [PATCH 2/4] refactor: moved some helper functions to shared folder --- .../alterScript/alterScriptFromDeltaHelper.js | 2 +- .../alterContainerHelper.js | 2 +- .../alterRelationshipsHelper.js | 10 +---- .../columnHelpers/alterTypeHelper.js | 3 +- .../columnHelpers/checkConstraintHelper.js | 3 +- .../columnHelpers/commentsHelper.js | 3 +- .../columnHelpers/defaultValueHelper.js | 3 +- .../columnHelpers/nonNullConstraintHelper.js | 3 +- .../containerHelpers/alterUnityTagsHelper.js | 1 - .../containerHelpers/commentsHelper.js | 3 +- .../entityHelpers/addColumnsHelper.js | 2 +- .../entityHelpers/primaryKeyHelper.js | 4 +- .../ddlProvider/ddlProvider.js | 4 +- forward_engineering/helpers/columnHelper.js | 4 +- forward_engineering/helpers/databaseHelper.js | 2 +- .../entityHelpers/checkConstraintHelper.js | 2 +- .../helpers/entityHelpers/primaryKeyHelper.js | 3 +- .../helpers/foreignKeyHelper.js | 3 +- forward_engineering/helpers/indexHelper.js | 3 +- .../helpers/jsonSchemaHelper.js | 3 +- forward_engineering/helpers/tableHelper.js | 2 +- forward_engineering/helpers/viewHelper.js | 3 +- .../sampleGeneration/generateSamples.js | 3 +- forward_engineering/utils/general.js | 43 ++----------------- shared/constants.js | 8 ++++ shared/general.js | 39 +++++++++++++++++ .../enums => shared}/reservedWords.js | 0 27 files changed, 87 insertions(+), 74 deletions(-) create mode 100644 shared/general.js rename {forward_engineering/enums => shared}/reservedWords.js (100%) diff --git a/forward_engineering/alterScript/alterScriptFromDeltaHelper.js b/forward_engineering/alterScript/alterScriptFromDeltaHelper.js index 8b2e525..a0249c0 100644 --- a/forward_engineering/alterScript/alterScriptFromDeltaHelper.js +++ b/forward_engineering/alterScript/alterScriptFromDeltaHelper.js @@ -28,7 +28,6 @@ const { isSupportUnityCatalog, getContainerName, replaceSpaceWithUnderscore, - prepareName, executeUnlessStreaming, } = require('../utils/general'); const { getModifyPkConstraintsScripts } = require('./alterScriptHelpers/entityHelpers/primaryKeyHelper'); @@ -36,6 +35,7 @@ const { getAlterRelationshipsScriptDtos } = require('./alterScriptHelpers/alterR const { Runtime } = require('../enums/runtime'); const { AlterScriptDto } = require('./types/AlterScriptDto'); const { getItems } = require('./alterScriptHelpers/columnHelpers/getItems'); +const { prepareName } = require('../../shared/general'); /** * @param scripts {Array} diff --git a/forward_engineering/alterScript/alterScriptHelpers/alterContainerHelper.js b/forward_engineering/alterScript/alterScriptHelpers/alterContainerHelper.js index c1ef642..bdef8f4 100644 --- a/forward_engineering/alterScript/alterScriptHelpers/alterContainerHelper.js +++ b/forward_engineering/alterScript/alterScriptHelpers/alterContainerHelper.js @@ -3,7 +3,6 @@ const { getDatabaseStatement, getDatabaseAlterStatement, getBucketKeyword } = re const { getEntityData, getIsChangeProperties, - prepareName, replaceSpaceWithUnderscore, isSupportUnityCatalog, } = require('../../utils/general'); @@ -13,6 +12,7 @@ const { getModifyUnityCatalogTagsScriptDtos, getModifyUnitySchemaTagsScriptDtos, } = require('./containerHelpers/alterUnityTagsHelper'); +const { prepareName } = require('../../../shared/general'); const containerProperties = ['comment', 'location', 'dbProperties', 'description']; const otherContainerProperties = ['name', 'location']; diff --git a/forward_engineering/alterScript/alterScriptHelpers/alterRelationshipsHelper.js b/forward_engineering/alterScript/alterScriptHelpers/alterRelationshipsHelper.js index 01bd1e5..fefff15 100644 --- a/forward_engineering/alterScript/alterScriptHelpers/alterRelationshipsHelper.js +++ b/forward_engineering/alterScript/alterScriptHelpers/alterRelationshipsHelper.js @@ -1,15 +1,9 @@ const _ = require('lodash'); -const { - getFullEntityName, - replaceSpaceWithUnderscore, - prepareName, - getContainerName, - replaceDotWithUnderscore, - executeUnlessStreaming, -} = require('../../utils/general'); +const { getFullEntityName, replaceSpaceWithUnderscore, executeUnlessStreaming } = require('../../utils/general'); const { AlterScriptDto } = require('../types/AlterScriptDto'); const { getUseSchemaScriptDto } = require('./alterEntityHelper'); const { getItems } = require('./columnHelpers/getItems'); +const { prepareName } = require('../../../shared/general'); /** * @param relationship {Object} diff --git a/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/alterTypeHelper.js b/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/alterTypeHelper.js index cf179f6..d4a4c0b 100644 --- a/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/alterTypeHelper.js +++ b/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/alterTypeHelper.js @@ -1,7 +1,8 @@ const _ = require('lodash'); const { AlterScriptDto } = require('../../types/AlterScriptDto'); -const { checkFieldPropertiesChanged, generateFullEntityName, prepareName } = require('../../../utils/general'); +const { checkFieldPropertiesChanged, generateFullEntityName } = require('../../../utils/general'); const { getColumns, getColumnStatement } = require('../../../helpers/columnHelper'); +const { prepareName } = require('../../../../shared/general'); /** * @return {boolean} diff --git a/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/checkConstraintHelper.js b/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/checkConstraintHelper.js index a885627..e655a03 100644 --- a/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/checkConstraintHelper.js +++ b/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/checkConstraintHelper.js @@ -1,6 +1,7 @@ const _ = require('lodash'); -const { generateFullEntityName, wrapInTicks } = require('../../../utils/general'); +const { generateFullEntityName } = require('../../../utils/general'); const { AlterScriptDto } = require('../../types/AlterScriptDto'); +const { wrapInTicks } = require('../../../../shared/general'); /** * @typedef GetAlterScriptDtoFunction diff --git a/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/commentsHelper.js b/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/commentsHelper.js index 911392a..2406c65 100644 --- a/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/commentsHelper.js +++ b/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/commentsHelper.js @@ -1,6 +1,7 @@ const _ = require('lodash'); -const { generateFullEntityName, prepareName, wrapInSingleQuotes } = require('../../../utils/general'); +const { generateFullEntityName, wrapInSingleQuotes } = require('../../../utils/general'); const { AlterScriptDto } = require('../../types/AlterScriptDto'); +const { prepareName } = require('../../../../shared/general'); /** * @return {({ collection, dbVersion }: { collection: Object, dbVersion: string }) => Array} diff --git a/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/defaultValueHelper.js b/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/defaultValueHelper.js index 87515df..e157a5a 100644 --- a/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/defaultValueHelper.js +++ b/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/defaultValueHelper.js @@ -1,6 +1,7 @@ const _ = require('lodash'); const { AlterScriptDto } = require('../../types/AlterScriptDto'); -const { generateFullEntityName, prepareName } = require('../../../utils/general'); +const { generateFullEntityName } = require('../../../utils/general'); +const { prepareName } = require('../../../../shared/general'); /** * @return {({ collection, dbVersion }: { collection: Object, dbVersion: string }) => Array} diff --git a/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/nonNullConstraintHelper.js b/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/nonNullConstraintHelper.js index 17fdd3c..e5f1ff9 100644 --- a/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/nonNullConstraintHelper.js +++ b/forward_engineering/alterScript/alterScriptHelpers/columnHelpers/nonNullConstraintHelper.js @@ -1,6 +1,7 @@ const _ = require('lodash'); -const { generateFullEntityName, prepareName } = require('../../../utils/general'); +const { generateFullEntityName } = require('../../../utils/general'); const { AlterScriptDto } = require('../../types/AlterScriptDto'); +const { prepareName } = require('../../../../shared/general'); /** * @return {({ collection, dbVersion }: { collection: Object, dbVersion: string }) => Array} diff --git a/forward_engineering/alterScript/alterScriptHelpers/containerHelpers/alterUnityTagsHelper.js b/forward_engineering/alterScript/alterScriptHelpers/containerHelpers/alterUnityTagsHelper.js index e2baf1a..cd505f0 100644 --- a/forward_engineering/alterScript/alterScriptHelpers/containerHelpers/alterUnityTagsHelper.js +++ b/forward_engineering/alterScript/alterScriptHelpers/containerHelpers/alterUnityTagsHelper.js @@ -1,6 +1,5 @@ const { getUnityTagsFromCompMod, getUnsetTagsNamesParamString } = require('../../../helpers/unityTagsHelper'); const { buildTagPairs } = require('../../../helpers/unityTagsHelper'); -const { prepareName } = require('../../../utils/general'); const { AlterScriptDto } = require('../../types/AlterScriptDto'); /** diff --git a/forward_engineering/alterScript/alterScriptHelpers/containerHelpers/commentsHelper.js b/forward_engineering/alterScript/alterScriptHelpers/containerHelpers/commentsHelper.js index 62f96d2..0ee5d4e 100644 --- a/forward_engineering/alterScript/alterScriptHelpers/containerHelpers/commentsHelper.js +++ b/forward_engineering/alterScript/alterScriptHelpers/containerHelpers/commentsHelper.js @@ -1,6 +1,7 @@ const { EntitiesThatSupportComments } = require('../../../enums/entityType'); -const { replaceSpaceWithUnderscore, wrapInSingleQuotes, prepareName } = require('../../../utils/general'); +const { replaceSpaceWithUnderscore, wrapInSingleQuotes } = require('../../../utils/general'); const { AlterScriptDto } = require('../../types/AlterScriptDto'); +const { prepareName } = require('../../../../shared/general'); /** * @return {{ diff --git a/forward_engineering/alterScript/alterScriptHelpers/entityHelpers/addColumnsHelper.js b/forward_engineering/alterScript/alterScriptHelpers/entityHelpers/addColumnsHelper.js index 4a4b8cc..b428dca 100644 --- a/forward_engineering/alterScript/alterScriptHelpers/entityHelpers/addColumnsHelper.js +++ b/forward_engineering/alterScript/alterScriptHelpers/entityHelpers/addColumnsHelper.js @@ -3,7 +3,6 @@ const { getColumns, getColumnsStatement } = require('../../../helpers/columnHelp const { getEntityProperties, generateFullEntityName, - prepareName, getDBVersionNumber, executeUnlessStreaming, } = require('../../../utils/general'); @@ -13,6 +12,7 @@ const { hydrateIndex } = require('./indexHelper'); const { generateModifyCollectionScript } = require('./modifyCollectionScript'); const { Runtime } = require('../../../enums/runtime'); const { getColumnTagsStatement } = require('../../../helpers/unityTagsHelper'); +const { prepareName } = require('../../../../shared/general'); /** * @typedef {{ diff --git a/forward_engineering/alterScript/alterScriptHelpers/entityHelpers/primaryKeyHelper.js b/forward_engineering/alterScript/alterScriptHelpers/entityHelpers/primaryKeyHelper.js index 8d665f0..7535e4a 100644 --- a/forward_engineering/alterScript/alterScriptHelpers/entityHelpers/primaryKeyHelper.js +++ b/forward_engineering/alterScript/alterScriptHelpers/entityHelpers/primaryKeyHelper.js @@ -1,6 +1,6 @@ const _ = require('lodash'); -const { generateFullEntityName, getEntityNameFromCollection, prepareName } = require('../../../utils/general'); -const { AlterScriptDto } = require('../../types/AlterScriptDto'); +const { generateFullEntityName, getEntityNameFromCollection } = require('../../../utils/general'); +const { prepareName } = require('../../../../shared/general'); /** * @return {(collection: Object, guid: string) => Object | undefined} diff --git a/forward_engineering/ddlProvider/ddlProvider.js b/forward_engineering/ddlProvider/ddlProvider.js index f7cbf81..3ac7443 100644 --- a/forward_engineering/ddlProvider/ddlProvider.js +++ b/forward_engineering/ddlProvider/ddlProvider.js @@ -1,10 +1,10 @@ const _ = require('lodash'); const templates = require('./ddlTemplates'); -const { getFullEntityName, replaceSpaceWithUnderscore, prepareName, wrapInBrackets } = require('../utils/general'); +const { getFullEntityName, replaceSpaceWithUnderscore, wrapInBrackets } = require('../utils/general'); const { getViewTagsStatement } = require('../helpers/unityTagsHelper'); const { getTablePropertiesClause, checkTablePropertiesDefined } = require('../helpers/tableHelper'); const viewHelper = require('../helpers/viewHelper'); -const keyHelper = require('../helpers/keyHelper'); +const { prepareName } = require('../../shared/general'); module.exports = app => { const { assignTemplates } = app.require('@hackolade/ddl-fe-utils'); diff --git a/forward_engineering/helpers/columnHelper.js b/forward_engineering/helpers/columnHelper.js index 3d0b937..72dc4e9 100644 --- a/forward_engineering/helpers/columnHelper.js +++ b/forward_engineering/helpers/columnHelper.js @@ -1,10 +1,7 @@ -'use strict'; - const { replaceSpaceWithUnderscore, getName, getTypeDescriptor, - prepareName, commentDeactivatedStatements, encodeStringLiteral, wrapInBrackets, @@ -13,6 +10,7 @@ const { getDBVersionNumber, } = require('../utils/general'); const { getCheckConstraint } = require('./constrainthelper'); +const { prepareName } = require('../../shared/general'); const getStructChild = (name, type, comment) => `${prepareName(name)}: ${type}` + (comment ? ` COMMENT '${encodeStringLiteral(comment)}'` : ''); diff --git a/forward_engineering/helpers/databaseHelper.js b/forward_engineering/helpers/databaseHelper.js index 8c68b5e..11e6361 100644 --- a/forward_engineering/helpers/databaseHelper.js +++ b/forward_engineering/helpers/databaseHelper.js @@ -6,11 +6,11 @@ const { getTab, replaceSpaceWithUnderscore, encodeStringLiteral, - prepareName, getDBVersionNumber, } = require('../utils/general'); const { getCatalogTagsStatement, getSchemaTagsStatement } = require('../helpers/unityTagsHelper'); const { Runtime } = require('../enums/runtime'); +const { prepareName } = require('../../shared/general'); /** * @param {string|undefined} location diff --git a/forward_engineering/helpers/entityHelpers/checkConstraintHelper.js b/forward_engineering/helpers/entityHelpers/checkConstraintHelper.js index cff0781..45870bd 100644 --- a/forward_engineering/helpers/entityHelpers/checkConstraintHelper.js +++ b/forward_engineering/helpers/entityHelpers/checkConstraintHelper.js @@ -1,4 +1,4 @@ -const { wrapInTicks } = require('../../utils/general'); +const { wrapInTicks } = require('../../../shared/general'); /** * @typedef GetStatementsFunction diff --git a/forward_engineering/helpers/entityHelpers/primaryKeyHelper.js b/forward_engineering/helpers/entityHelpers/primaryKeyHelper.js index 924d553..eb92575 100644 --- a/forward_engineering/helpers/entityHelpers/primaryKeyHelper.js +++ b/forward_engineering/helpers/entityHelpers/primaryKeyHelper.js @@ -1,5 +1,6 @@ const _ = require('lodash'); -const { prepareName, getName, getFullEntityName } = require('../../utils/general'); +const { getName, getFullEntityName } = require('../../utils/general'); +const { prepareName } = require('../../../shared/general'); /** * @param entityJsonSchema {Object} diff --git a/forward_engineering/helpers/foreignKeyHelper.js b/forward_engineering/helpers/foreignKeyHelper.js index 24d8002..267bd09 100644 --- a/forward_engineering/helpers/foreignKeyHelper.js +++ b/forward_engineering/helpers/foreignKeyHelper.js @@ -1,7 +1,8 @@ const _ = require('lodash'); const schemaHelper = require('./jsonSchemaHelper'); -const { getName, getTab, commentDeactivatedStatements, prepareName } = require('../utils/general'); +const { getName, getTab, commentDeactivatedStatements } = require('../utils/general'); const ddlTemplates = require('../ddlProvider/ddlTemplates'); +const { prepareName } = require('../../shared/general'); const getIdToNameHashTable = ( relationships, diff --git a/forward_engineering/helpers/indexHelper.js b/forward_engineering/helpers/indexHelper.js index bdf018a..481636e 100644 --- a/forward_engineering/helpers/indexHelper.js +++ b/forward_engineering/helpers/indexHelper.js @@ -1,9 +1,10 @@ 'use strict'; const _ = require('lodash'); -const { getTab, buildStatement, prepareName, getName, replaceSpaceWithUnderscore } = require('../utils/general'); +const { getTab, buildStatement, getName, replaceSpaceWithUnderscore } = require('../utils/general'); const schemaHelper = require('./jsonSchemaHelper'); const { getItemByPath } = require('./jsonSchemaHelper'); +const { prepareName } = require('../../shared/general'); const getIndexStatement = ({ tableName, dbName, columns, options, isActivated }) => { return buildStatement( diff --git a/forward_engineering/helpers/jsonSchemaHelper.js b/forward_engineering/helpers/jsonSchemaHelper.js index 1ae00cd..0faafbf 100644 --- a/forward_engineering/helpers/jsonSchemaHelper.js +++ b/forward_engineering/helpers/jsonSchemaHelper.js @@ -1,6 +1,7 @@ 'use strict'; -const { getName, prepareName } = require('../utils/general'); +const { getName } = require('../utils/general'); +const { prepareName } = require('../../shared/general'); const getPathById = (schema, id, path) => { if (schema.GUID === id) { diff --git a/forward_engineering/helpers/tableHelper.js b/forward_engineering/helpers/tableHelper.js index 5ca4778..0617c16 100644 --- a/forward_engineering/helpers/tableHelper.js +++ b/forward_engineering/helpers/tableHelper.js @@ -6,7 +6,6 @@ const { replaceSpaceWithUnderscore, commentDeactivatedInlineKeys, encodeStringLiteral, - prepareName, getDifferentItems, getFullEntityName, getDBVersionNumber, @@ -25,6 +24,7 @@ const constraintHelper = require('./constrainthelper'); const { getColumnTagsStatement } = require('./unityTagsHelper'); const { Runtime } = require('../enums/runtime'); const { ScheduleTypesEnum } = require('../enums/schedules'); +const { prepareName } = require('../../shared/general'); const getCreateStatement = ({ fullTableName, diff --git a/forward_engineering/helpers/viewHelper.js b/forward_engineering/helpers/viewHelper.js index f42ab4f..2aa5634 100644 --- a/forward_engineering/helpers/viewHelper.js +++ b/forward_engineering/helpers/viewHelper.js @@ -8,7 +8,8 @@ const _ = require('lodash'); const { getKeyNames } = require('./keyHelper'); const { getColumns } = require('./columnHelper'); const { getPartitionKeyStatement, getPartitionsKeys, getClusteringKeys } = require('./tableHelper'); -const { prepareName, encodeStringLiteral, commentDeactivatedStatement } = require('../utils/general'); +const { encodeStringLiteral, commentDeactivatedStatement } = require('../utils/general'); +const { prepareName } = require('../../shared/general'); const getColumnNames = (collectionRefsDefinitionsMap, columns) => { return _.uniq( diff --git a/forward_engineering/sampleGeneration/generateSamples.js b/forward_engineering/sampleGeneration/generateSamples.js index 2914c45..49805a5 100644 --- a/forward_engineering/sampleGeneration/generateSamples.js +++ b/forward_engineering/sampleGeneration/generateSamples.js @@ -1,6 +1,7 @@ const _ = require('lodash'); -const { generateFullEntityNameFromBucketAndTableNames, prepareName } = require('../utils/general'); +const { generateFullEntityNameFromBucketAndTableNames } = require('../utils/general'); const { mapInsertSampleToDml } = require('./mapInsertSampleToDml'); +const { prepareName } = require('../../shared/general'); /** * @param columnIndex {number} diff --git a/forward_engineering/utils/general.js b/forward_engineering/utils/general.js index 5cf25f3..daeb4d7 100644 --- a/forward_engineering/utils/general.js +++ b/forward_engineering/utils/general.js @@ -1,12 +1,8 @@ -'use strict'; - const _ = require('lodash'); const sqlFormatter = require('@sqltools/formatter'); -const { RESERVED_WORDS_AS_ARRAY } = require('../enums/reservedWords'); const { Runtime } = require('../enums/runtime'); const { escapeV6IpForURL } = require('./escapeV6IpForURL'); - -const MAX_STANDARD_ASCII_SYMBOL_CODE = 127; +const { prepareName } = require('../../shared/general'); /** * @typedef {((args: any) => string) | ((args: any) => ChainFunction)} ChainFunction @@ -53,34 +49,6 @@ const buildStatement = (mainStatement, isActivated) => { return chain; }; -const isEscaped = name => /`[\s\S]*`/.test(name); - -const checkContainSpecialCharacters = (name = '') => { - return !/^\w+$/.test(name); -}; - -const prepareName = (name = '') => { - const containSpacesRegexp = /[\s-]/g; - const isEscapedName = isEscaped(name); - const containSpaces = containSpacesRegexp.test(name); - const containSpecialCharacters = checkContainSpecialCharacters(name); - const includeReversedWords = RESERVED_WORDS_AS_ARRAY.includes(name.toLowerCase()); - const containVariableExpression = /\$\{.+\}/g.test(name); - - const shouldBeWrappedInTicks = - !isEscapedName && - (containSpaces || containSpecialCharacters || includeReversedWords || containVariableExpression); - - if (name === '') { - return ''; - } else if (shouldBeWrappedInTicks) { - name = name.replace('`', '``'); - - return wrapInTicks(name); - } else { - return name; - } -}; const replaceSpaceWithUnderscore = (name = '') => { return name.replace(/\s/g, '_'); }; @@ -96,6 +64,7 @@ const getRelationshipName = relationship => { }; const getTab = (tabNum, configData) => (Array.isArray(configData) ? configData[tabNum] || {} : {}); + const indentString = (str, tab = 4) => (str || '') .split('\n') @@ -112,7 +81,7 @@ const getTypeDescriptor = typeName => { descriptors[typeName] = require(`../../types/${typeName}.json`); return descriptors[typeName]; - } catch (e) { + } catch { return {}; } }; @@ -178,10 +147,6 @@ const wrapInSingleQuotes = (str = '') => { return `'${encodeStringLiteral(str)}'`; }; -const wrapInTicks = (str = '') => { - return `\`${str}\``; -}; - const wrapInBrackets = (str = '') => { return /^\(.*\)$/.test(str) ? str : `(${str})`; }; @@ -365,7 +330,6 @@ module.exports = { indentString, getTypeDescriptor, getRelationshipName, - prepareName, replaceSpaceWithUnderscore, replaceDotWithUnderscore, commentDeactivatedStatement, @@ -375,7 +339,6 @@ module.exports = { encodeStringLiteral, buildScript, wrapInSingleQuotes, - wrapInTicks, wrapInBrackets, getEntityData, getFullEntityName, diff --git a/shared/constants.js b/shared/constants.js index d2224c1..957b701 100644 --- a/shared/constants.js +++ b/shared/constants.js @@ -11,7 +11,15 @@ const COMMAND_EXECUTION_STATUS = { const REQUEST_TIMEOUT_MESSAGE = 'Request timeout exceeded, please try again or increase query request timeout in Tools > Options > Reverse-Engineering'; +/** How many columns to pull nullable/index metadata per Python command (avoids “Results too large”). */ +const FIELD_METADATA_COLUMN_BATCH_SIZE = 12; + +/** Truncate long strings in Spark field metadata when serializing indexes (per batch). */ +const FIELD_METADATA_STRING_MAX = 4000; + module.exports = { COMMAND_EXECUTION_STATUS, REQUEST_TIMEOUT_MESSAGE, + FIELD_METADATA_COLUMN_BATCH_SIZE, + FIELD_METADATA_STRING_MAX, }; diff --git a/shared/general.js b/shared/general.js new file mode 100644 index 0000000..50800ab --- /dev/null +++ b/shared/general.js @@ -0,0 +1,39 @@ +const { RESERVED_WORDS_AS_ARRAY } = require('./reservedWords'); + +const isEscaped = name => /`[\s\S]*`/.test(name); + +const checkContainSpecialCharacters = (name = '') => { + return !/^\w+$/.test(name); +}; + +const wrapInTicks = (str = '') => { + return `\`${str}\``; +}; + +const prepareName = (name = '') => { + const containSpacesRegexp = /[\s-]/g; + const isEscapedName = isEscaped(name); + const containSpaces = containSpacesRegexp.test(name); + const containSpecialCharacters = checkContainSpecialCharacters(name); + const includeReversedWords = RESERVED_WORDS_AS_ARRAY.includes(name.toLowerCase()); + const containVariableExpression = /\$\{.+}/g.test(name); + + const shouldBeWrappedInTicks = + !isEscapedName && + (containSpaces || containSpecialCharacters || includeReversedWords || containVariableExpression); + + if (name === '') { + return ''; + } else if (shouldBeWrappedInTicks) { + name = name.replace('`', '``'); + + return wrapInTicks(name); + } else { + return name; + } +}; + +module.exports = { + prepareName, + wrapInTicks, +}; diff --git a/forward_engineering/enums/reservedWords.js b/shared/reservedWords.js similarity index 100% rename from forward_engineering/enums/reservedWords.js rename to shared/reservedWords.js From 1428b47cc34b77066e9ff168c2dadadc006e3150 Mon Sep 17 00:00:00 2001 From: chulanovskyi Date: Thu, 26 Mar 2026 10:26:58 +0200 Subject: [PATCH 3/4] chore: ignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index add0add..7732d42 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ node_modules .DS_Store release +.cursor/plans +.cursor/chats From 61aa81d2c6a0607cc707c0e0181dc99a554694db Mon Sep 17 00:00:00 2001 From: chulanovskyi Date: Thu, 26 Mar 2026 11:31:10 +0200 Subject: [PATCH 4/4] feat: by partial columns fallback --- reverse_engineering/api.js | 1 + .../helpers/databricksHelper.js | 16 +- .../helpers/fetchRequestHelper.js | 419 ++++++++++++++---- .../helpers/pythonScriptGeneratorHelper.js | 94 +++- shared/constants.js | 6 + 5 files changed, 444 insertions(+), 92 deletions(-) diff --git a/reverse_engineering/api.js b/reverse_engineering/api.js index 49225f8..351f71d 100644 --- a/reverse_engineering/api.js +++ b/reverse_engineering/api.js @@ -216,6 +216,7 @@ module.exports = { collections, clusterState.spark_version, logger, + clusterData, ); const ddlByEntity = entitiesDdl.reduce((ddlByEntity, ddlObject) => { const entityName = Object.keys(ddlObject)[0]; diff --git a/reverse_engineering/helpers/databricksHelper.js b/reverse_engineering/helpers/databricksHelper.js index 4ac8722..d9f0c11 100644 --- a/reverse_engineering/helpers/databricksHelper.js +++ b/reverse_engineering/helpers/databricksHelper.js @@ -5,8 +5,13 @@ const async = require('async'); const fetchRequestHelper = require('./fetchRequestHelper'); const { convertCustomTags, cleanEntityName, isSupportGettingListOfViews } = require('./utils'); -const getEntityCreateStatement = (connectionInfo, dbName, entityName, logger) => { - return fetchRequestHelper.fetchCreateStatementRequest(`\`${dbName}\`.\`${entityName}\``, connectionInfo, logger); +const getEntityCreateStatement = (connectionInfo, dbName, entityName, logger, ddlOptions = {}) => { + return fetchRequestHelper.fetchCreateStatementRequest( + `\`${dbName}\`.\`${entityName}\``, + connectionInfo, + logger, + ddlOptions, + ); }; const getFirstDatabaseCollectionName = async (connectionInfo, sparkVersion, logger) => { @@ -136,7 +141,7 @@ const isSupportUnityCatalog = sparkVersion => { const isEnabledUnityCatalog = data_security_mode => ['SINGLE_USER', 'USER_ISOLATION'].includes(data_security_mode); -const getEntitiesDDL = (connectionInfo, databasesNames, collectionsNames, sparkVersion, logger) => { +const getEntitiesDDL = (connectionInfo, databasesNames, collectionsNames, sparkVersion, logger, clusterData) => { const entitiesNames = _.flatMap(databasesNames, dbName => { return (collectionsNames[dbName] || []).map(entityName => ({ dbName, name: entityName })); }); @@ -145,7 +150,10 @@ const getEntitiesDDL = (connectionInfo, databasesNames, collectionsNames, sparkV const entityName = cleanEntityName(sparkVersion, entity.name); logger.log('info', { db: entity.dbName, entity: entityName }, 'Getting entity DDL'); - const ddlStatement = await getEntityCreateStatement(connectionInfo, entity.dbName, entityName, logger); + const resolvedCatalogName = _.get(clusterData, [entity.dbName, 'dbProperties', 'catalogName']); + const ddlStatement = await getEntityCreateStatement(connectionInfo, entity.dbName, entityName, logger, { + resolvedCatalogName, + }); logger.log('info', { db: entity.dbName, entity: entityName }, 'DDL retrieved successfully'); diff --git a/reverse_engineering/helpers/fetchRequestHelper.js b/reverse_engineering/helpers/fetchRequestHelper.js index 5077a2c..3faa8c0 100644 --- a/reverse_engineering/helpers/fetchRequestHelper.js +++ b/reverse_engineering/helpers/fetchRequestHelper.js @@ -4,13 +4,23 @@ const nodeFetch = require('node-fetch'); const AbortController = require('abort-controller'); const { backOff } = require('exponential-backoff'); -const { getClusterData, getViewNamesCommand } = require('./pythonScriptGeneratorHelper'); +const { + getClusterData, + getClusterColumnNames, + getClusterFieldMetadataBatch, + getTableSchemaColumnsForDdlFallback, + getViewNamesCommand, +} = require('./pythonScriptGeneratorHelper'); const { prepareNamesForInsertionIntoScalaCode, removeParentheses } = require('./utils'); const { generateSamplesScript } = require('../../forward_engineering/sampleGeneration/sampleGenerationService'); const { batchProcessFile } = require('./fileHelper'); -const { COMMAND_EXECUTION_STATUS, REQUEST_TIMEOUT_MESSAGE } = require('../../shared/constants'); - -const JSON_OBJECTS_DELIMITER = '}, {'; +const { + COMMAND_EXECUTION_STATUS, + REQUEST_TIMEOUT_MESSAGE, + FIELD_METADATA_COLUMN_BATCH_SIZE, + SPARK_LANGUAGE, +} = require('../../shared/constants'); +const { prepareName } = require('../../shared/general'); const BATCH_SIZE = 5000; const COMMAND_EXECUTION_MAX_DELAY = 3000; @@ -310,7 +320,7 @@ const fetchDatabaseViewsNames = ({ dbName, connectionInfo, logger }) => executeCommand({ connectionInfo, command: `SHOW VIEWS IN \`${dbName}\``, logger }); const fetchDatabaseViewsNamesViaPython = ({ dbName, connectionInfo, logger }) => - executeCommand({ connectionInfo, command: getViewNamesCommand(dbName), language: 'python', logger }); + executeCommand({ connectionInfo, command: getViewNamesCommand(dbName), language: SPARK_LANGUAGE.python, logger }); const fetchClusterTablesNames = ({ dbName, connectionInfo, logger }) => executeCommand({ connectionInfo, command: `SHOW TABLES IN \`${dbName}\``, logger }); @@ -324,28 +334,35 @@ const fetchClusterData = async ( ) => { const databasesPropertiesResult = await async.mapLimit(databasesNames, 40, async dbName => { logger.log('info', '', `Start describe schema: ${dbName} `); + const dbInfoResult = await executeCommand({ connectionInfo, command: `DESCRIBE DATABASE EXTENDED \`${dbName}\``, logger, }); + logger.log('info', '', `Schema: ${dbName} successfully described`); + const dbProperties = dbInfoResult.reduce((dbProperties, row) => { - switch (row[0]) { + const key = row[0]; + const value = row[1]; + + switch (key) { case 'Location': { const propertyName = isManagedLocationSupports ? 'managedLocation' : 'location'; - return { ...dbProperties, [propertyName]: row[1] }; + return { ...dbProperties, [propertyName]: value }; } case 'Comment': - return { ...dbProperties, description: row[1] }; + return { ...dbProperties, description: value }; case 'Properties': - return { ...dbProperties, dbProperties: convertDbProperties(row[1]) }; + return { ...dbProperties, dbProperties: convertDbProperties(value) }; case 'Catalog Name': - return { ...dbProperties, catalogName: row[1] }; + return { ...dbProperties, catalogName: value }; default: return dbProperties; } }, {}); + return { dbName, dbProperties }; }); @@ -355,11 +372,12 @@ const fetchClusterData = async ( ); const databasesTablesInfo = await fetchFieldMetadata(databasesNames, collectionsNames, connectionInfo, logger); + return databasesNames.reduce( (clusterData, dbName) => ({ ...clusterData, [dbName]: { - dbTables: _.get(databasesTablesInfo, dbName, {}), + dbTables: _.get(databasesTablesInfo, dbName, []), dbProperties: _.get(databasesProperties, dbName, {}), }, }), @@ -368,110 +386,240 @@ const fetchClusterData = async ( }; /** - * @param {string} jsonString - * @return {string[]} + * @param {unknown} raw + * @return {string} */ -const splitJsonObjects = (jsonString = '') => jsonString.split(JSON_OBJECTS_DELIMITER); +const coercePythonNotebookOutput = raw => { + if (raw == null) { + return ''; + } + + if (typeof raw === 'string') { + return raw.trim(); + } + + if (Array.isArray(raw)) { + const mapSubArrays = row => row.map(content => String(content ?? '')).join(''); + + return raw.map(row => (Array.isArray(row) ? mapSubArrays(row) : String(row ?? ''))).join('\n'); + } + + if (typeof raw === 'object') { + if (raw.data != null) { + return String(raw.data).trim(); + } + try { + return JSON.stringify(raw); + } catch { + return ''; + } + } + + return String(raw).trim(); +}; /** - * @param {string} corruptedJsonData + * @param {unknown} err * @return {string} */ -const filterCorruptedEnd = corruptedJsonData => { - const jsonObjects = splitJsonObjects(corruptedJsonData); - return jsonObjects.slice(0, jsonObjects.length - 1).join(JSON_OBJECTS_DELIMITER); +const stringifyErrorMessage = err => { + if (err == null) { + return ''; + } + if (typeof err === 'string') { + return err; + } + if (typeof err === 'object' && err.message != null) { + return String(err.message); + } + try { + return JSON.stringify(err); + } catch { + return String(err); + } }; +const isNotebookOutputTruncation = str => /\*\*\* WARNING:.*output/i.test(str); + /** - * @param {string} corruptedJsonData - * @return {string} + * @param {string} message + * @return {boolean} */ -const filterCorruptedStart = corruptedJsonData => { - const jsonObjects = splitJsonObjects(corruptedJsonData); - return jsonObjects.slice(1).join(JSON_OBJECTS_DELIMITER); +const isOutputLimitExceeded = message => + /results too large/i.test(message) || + /max output size exceeded/i.test(message) || + /skipped \d* bytes of output/i.test(message); + +const chunkColumnNames = (names, size) => { + const out = []; + for (let i = 0; i < names.length; i += size) { + out.push(names.slice(i, i + size)); + } + return out; }; +const mergeTableFieldBatches = partials => + partials.reduce( + (acc, partial) => ({ + name: partial.name, + nullableMap: { ...acc.nullableMap, ...partial.nullableMap }, + indexes: { ...acc.indexes, ...partial.indexes }, + }), + { name: '', nullableMap: {}, indexes: {} }, + ); + /** - * @param {string} databasesTablesInfoResult - * @param {boolean} isTruncatedInMiddle - * @return {string} + * @param {Object} params - property bag + * @param {Record>} params.columnPlan + * @param {Object} params.connectionInfo + * @param {Object} params.logger + * @returns {Promise>} */ -const filterCorruptedData = (databasesTablesInfoResult, isTruncatedInMiddle) => { - if (isTruncatedInMiddle) { - const warningDelimiter = '*** WARNING: max output size exceeded, skipping output. ***'; - const [firstChunk, lastChunk] = databasesTablesInfoResult.split(warningDelimiter); - const filteredFirst = filterCorruptedEnd(firstChunk); - const filteredLast = filterCorruptedStart(lastChunk); - const filteredChunks = [filteredFirst, filteredLast].filter(Boolean); - const joined = filteredChunks.join(JSON_OBJECTS_DELIMITER); - return filteredLast ? joined : joined + '}]}'; +const fetchClusterFieldMetadataInBatches = async ({ columnPlan, connectionInfo, logger }) => { + const tasks = []; + + for (const dbName of Object.keys(columnPlan)) { + for (const table of columnPlan[dbName]) { + const batches = chunkColumnNames(table.columns, FIELD_METADATA_COLUMN_BATCH_SIZE); + const batchList = batches.length ? batches : [[]]; + + for (const batch of batchList) { + tasks.push({ dbName, tableName: table.name, batch }); + } + } + } + + const rows = await async.mapLimit(tasks, 10, async ({ dbName, tableName, batch }) => { + const columnsJson = JSON.stringify(batch); + const command = getClusterFieldMetadataBatch(dbName, tableName, columnsJson); + const out = await executeCommand({ connectionInfo, command, language: SPARK_LANGUAGE.python, logger }); + const parsed = JSON.parse(coercePythonNotebookOutput(out)); + return { dbName, tableName, table: parsed }; + }); + + const byTable = new Map(); + + const keySeparator = '_'; + + for (const row of rows) { + const key = `${row.dbName}${keySeparator}${row.tableName}`; + if (!byTable.has(key)) { + byTable.set(key, []); + } + byTable.get(key).push(row.table); + } + + const clusterData = {}; + + for (const dbName of Object.keys(columnPlan)) { + clusterData[dbName] = columnPlan[dbName].map(({ name }) => { + const key = `${dbName}${keySeparator}${name}`; + const parts = byTable.get(key) || []; + return mergeTableFieldBatches(parts); + }); } - return filterCorruptedEnd(databasesTablesInfoResult) + '}]}'; + return clusterData; }; -const fetchFieldMetadata = async (databasesNames, collectionsNames, connectionInfo, logger, previousData = {}) => { +const fetchFieldMetadataBatched = async ( + databasesNames, + collectionsNames, + connectionInfo, + logger, + previousData = {}, +) => { const { tableNames, dbNames } = prepareNamesForInsertionIntoScalaCode(databasesNames, collectionsNames); - const getClusterDataCommand = getClusterData(tableNames.join(', '), dbNames.join(', ')); + + const columnListCommand = getClusterColumnNames(tableNames.join(', '), dbNames.join(', ')); + logger.log( 'info', '', - `Start retrieving tables info: \nDatabases: ${dbNames.join(', ')} \nTables: ${tableNames.join(', ')}`, + `Start retrieving tables info (batched): \nDatabases: ${dbNames.join(', ')} \nTables: ${tableNames.join(', ')}`, ); - const databasesTablesInfoResult = await executeCommand({ + + const namesRaw = await executeCommand({ connectionInfo, - command: getClusterDataCommand, - language: 'python', + command: columnListCommand, + language: SPARK_LANGUAGE.python, logger, }); - logger.log('info', '', `Finish retrieving tables info: ${databasesTablesInfoResult}`); - const isTruncatedResponse = /\*\*\* WARNING: skipped \d* bytes of output \*\*\*$/.test(databasesTablesInfoResult); - const isTruncatedInMiddle = /\*\*\* WARNING: max output size exceeded, skipping output. \*\*\*/.test( - databasesTablesInfoResult, - ); + const namesStr = coercePythonNotebookOutput(namesRaw); - try { - if (!isTruncatedResponse && !isTruncatedInMiddle) { - const parsedData = JSON.parse(databasesTablesInfoResult); - return mergeChunksOfData(previousData, parsedData); - } + if (isNotebookOutputTruncation(namesStr)) { + throw new Error('Databricks truncated the column name list. Try reverse-engineering fewer tables at once.'); + } - const fullCompletedData = filterCorruptedData(databasesTablesInfoResult, isTruncatedInMiddle); - const parsedData = JSON.parse(fullCompletedData); - const mergedDataChunks = mergeChunksOfData(previousData, parsedData); - const { dbNames: filteredDbNames, tableNames: filteredTableNames } = getFilteredEntities( - collectionsNames, - mergedDataChunks, - ); + let columnPlan; - return fetchFieldMetadata(filteredDbNames, filteredTableNames, connectionInfo, logger, mergedDataChunks); + try { + columnPlan = JSON.parse(namesStr); } catch (error) { - logger.log('error', { error }, `\nDatabricks response: ${databasesTablesInfoResult}\n`); + logger.log('error', { error }, `Column list parse failed. Snippet: ${namesStr.slice(0, 1500)}`); throw error; } + + const clusterData = await fetchClusterFieldMetadataInBatches({ columnPlan, connectionInfo, logger }); + + logger.log('info', '', 'Finished retrieving table field metadata (batched).'); + + return mergeChunksOfData(previousData, clusterData); }; -const getFilteredEntities = (tableNames, parsedData) => { - return Object.keys(parsedData).reduce( - (resultEntities, dbName) => { - const parsedTableNames = parsedData[dbName].map(table => table.name); - const dbTableNames = tableNames[dbName]; - const filteredTableNames = dbTableNames.filter(name => !parsedTableNames.includes(name)); - if (!filteredTableNames.length) { - return resultEntities; - } +const fetchFieldMetadata = async (databasesNames, collectionsNames, connectionInfo, logger, previousData = {}) => { + const { tableNames, dbNames } = prepareNamesForInsertionIntoScalaCode(databasesNames, collectionsNames); - return { - dbNames: [...resultEntities.dbNames, dbName], - tableNames: { - ...resultEntities.tableNames, - [dbName]: filteredTableNames, - }, - }; - }, - { dbNames: [], tableNames: {} }, + logger.log( + 'info', + '', + `Start retrieving tables info: \nDatabases: ${dbNames.join(', ')} \nTables: ${tableNames.join(', ')}`, ); + + const getFullClusterInfoCommand = getClusterData(tableNames.join(', '), dbNames.join(', ')); + + try { + const rawOutput = await executeCommand({ + connectionInfo, + command: getFullClusterInfoCommand, + language: SPARK_LANGUAGE.python, + logger, + }); + + const str = coercePythonNotebookOutput(rawOutput); + + if (isNotebookOutputTruncation(str)) { + logger.log('info', '', 'Cluster field metadata output truncated; using batched retrieval.'); + return fetchFieldMetadataBatched(databasesNames, collectionsNames, connectionInfo, logger, previousData); + } + + try { + const parsed = JSON.parse(str); + logger.log('info', '', 'Finished retrieving table field metadata (single command).'); + + return mergeChunksOfData(previousData, parsed); + } catch (parseError) { + logger.log( + 'warning', + { error: parseError }, + 'Single-pass metadata JSON parse failed; using batched retrieval.', + ); + + return fetchFieldMetadataBatched(databasesNames, collectionsNames, connectionInfo, logger, previousData); + } + } catch (error) { + const msg = stringifyErrorMessage(error); + if (isOutputLimitExceeded(msg)) { + logger.log( + 'info', + { message: msg.slice(0, 300) }, + 'Single-pass cluster metadata failed; using batched retrieval.', + ); + return fetchFieldMetadataBatched(databasesNames, collectionsNames, connectionInfo, logger, previousData); + } + throw error; + } }; const mergeChunksOfData = (leftObj, rightObj) => { @@ -482,13 +630,96 @@ const mergeChunksOfData = (leftObj, rightObj) => { }); }; -const fetchCreateStatementRequest = async (entityName, connectionInfo, logger) => { +/** + * @param {string} entityName - `` `default`.`my_table` `` + * @return {{ schemaName: string, tableName: string } | null} + */ +const parseEntityBacktickParts = entityName => { + const parts = new RegExp(/`([^`]+)`\.`([^`]+)`/).exec(String(entityName)); + return parts ? { schemaName: parts[1], tableName: parts[2] } : null; +}; + +/** + * Python notebook context does not inherit USE CATALOG from the SQL context; qualify with catalog when present. + * @param {Object} params - property bag + * @param {string} params.schemaName + * @param {string} params.tableName + * @param {string} [params.catalogName] + * @return {string} + */ +const buildSparkTableFullNameForPython = ({ schemaName, tableName, catalogName } = {}) => { + if (catalogName) { + return `${catalogName}.${schemaName}.${tableName}`; + } + return `${schemaName}.${tableName}`; +}; + +/** + * @param {string} entityName + * @param {Array<{ name: string, colType: string }>} columns + * @return {string} + */ +const buildMinimalCreateTableFromSchema = (entityName, columns) => { + if (!columns.length) { + return ''; + } + + const lines = columns + .map(column => { + return ` \`${String(column.name).replaceAll('`', '')}\` ${column.colType}`; + }) + .join(',\n'); + + return `CREATE TABLE ${entityName} (\n${lines}\n)\nUSING DELTA`; +}; + +const fetchCreateStatementRequest = async (entityName, connectionInfo, logger, ddlOptions = {}) => { try { const result = await executeCommand({ connectionInfo, command: `SHOW CREATE TABLE ${entityName};`, logger }); return _.get(result, '[0][0]', ''); } catch (error) { - logger.log('error', error, `Error during retrieve create table DDL statement. Table name: ${entityName}`); - return ''; + const msg = stringifyErrorMessage(error); + + if (!isOutputLimitExceeded(msg)) { + logger.log('error', error, `Error during retrieve create table DDL statement. Table name: ${entityName}`); + return ''; + } + + logger.log( + 'info', + { message: msg.slice(0, 500) }, + `SHOW CREATE TABLE result too large for ${entityName}; building minimal DDL from Spark schema.`, + ); + + const parts = parseEntityBacktickParts(entityName); + + if (!parts) { + logger.log('warning', { entityName }, 'Cannot parse schema/table for DDL fallback.'); + return ''; + } + + const catalogForFqn = ddlOptions.resolvedCatalogName || connectionInfo.catalogName; + + const fullName = buildSparkTableFullNameForPython({ + schemaName: parts.schemaName, + tableName: parts.tableName, + catalogName: prepareName(catalogForFqn), + }); + + try { + const script = getTableSchemaColumnsForDdlFallback(fullName); + const raw = await executeCommand({ + connectionInfo, + command: script, + language: SPARK_LANGUAGE.python, + logger, + }); + const columns = JSON.parse(coercePythonNotebookOutput(raw)); + return buildMinimalCreateTableFromSchema(entityName, columns); + } catch (fallbackError) { + logger.log('error', fallbackError, `DDL fallback failed for ${entityName}`); + return ''; + } } }; @@ -677,7 +908,7 @@ const getPythonSparkConfig = config => { .join('\n'); }; -const executeCommand = ({ connectionInfo, command, language = 'sql', logger }) => { +const executeCommand = ({ connectionInfo, command, language = SPARK_LANGUAGE.sql, logger }) => { return createContext(connectionInfo, language).then(async contextId => { const payload = { connectionInfo, @@ -689,9 +920,9 @@ const executeCommand = ({ connectionInfo, command, language = 'sql', logger }) = if (connectionInfo.sparkConfig && Object.keys(connectionInfo.sparkConfig).length) { let sparkConfig; - if (language === 'sql') { + if (language === SPARK_LANGUAGE.sql) { sparkConfig = getSqlSparkConfig(connectionInfo.sparkConfig); - } else if (language === 'python') { + } else if (language === SPARK_LANGUAGE.python) { sparkConfig = getPythonSparkConfig(connectionInfo.sparkConfig); } @@ -704,6 +935,22 @@ const executeCommand = ({ connectionInfo, command, language = 'sql', logger }) = }); }; +const getCommandExecutionErrorDetail = body => { + const error = body?.results; + if (error) { + const parts = [error.data, error.cause].filter(v => v != null && String(v).length).map(String); + if (parts.length) { + return parts.join(' | '); + } + } + try { + const s = JSON.stringify(body); + return s.length > 4000 ? `${s.slice(0, 4000)}…` : s; + } catch { + return 'Command execution failed'; + } +}; + const getCommandExecutionResult = ({ query, options, commandOptions }) => { return fetch(query, options) .then(async response => { @@ -734,7 +981,7 @@ const getCommandExecutionResult = ({ query, options, commandOptions }) => { if (body.status === COMMAND_EXECUTION_STATUS.ERROR) { throw { - message: 'Error during receiving command result', + message: getCommandExecutionErrorDetail(body), code: '', description: commandOptions, }; diff --git a/reverse_engineering/helpers/pythonScriptGeneratorHelper.js b/reverse_engineering/helpers/pythonScriptGeneratorHelper.js index 36696d7..74d91a9 100644 --- a/reverse_engineering/helpers/pythonScriptGeneratorHelper.js +++ b/reverse_engineering/helpers/pythonScriptGeneratorHelper.js @@ -1,20 +1,31 @@ // --- Use only spaces in python script literals +const { FIELD_METADATA_STRING_MAX } = require('../../shared/constants'); + +/** Full cluster field metadata in one notebook exit (preferred when output fits). */ const getClusterData = (tablesNames, databasesNames) => ` import json databasesNames = [${databasesNames}] databasesTablesNames = {${tablesNames}} +def metadata_to_dict(m): + try: + if m is None: + return {} + return json.loads(m.json()) + except Exception: + return {} + def getTableFieldMetadata(dbName, tableName): try: tableFieldsMetaInfo = spark.table(dbName + "." + tableName).schema.fields return { "name": tableName, "nullableMap": { field.name: field.nullable for field in tableFieldsMetaInfo }, - "indexes": { field.name: field.metadata for field in tableFieldsMetaInfo } + "indexes": { field.name: metadata_to_dict(field.metadata) for field in tableFieldsMetaInfo } } - except: + except Exception: return { "name": tableName, "nullableMap": {}, @@ -30,6 +41,70 @@ clusterData = { dbName: getDatabaseMetadata(dbName) for dbName in databasesNames dbutils.notebook.exit(json.dumps(clusterData)) `; +const getClusterColumnNames = (tablesNames, databasesNames) => ` +import json + +databasesNames = [${databasesNames}] +databasesTablesNames = {${tablesNames}} + +def column_names_for_table(db_name, table_name): + try: + return [f.name for f in spark.table(db_name + "." + table_name).schema.fields] + except Exception: + return [] + +cluster_column_names = {} +for db_name in databasesNames: + cluster_column_names[db_name] = [ + {"name": table_name, "columns": column_names_for_table(db_name, table_name)} + for table_name in databasesTablesNames.get(db_name, []) + ] + +dbutils.notebook.exit(json.dumps(cluster_column_names)) +`; + +/** + * @param {string} dbName + * @param {string} tableName + * @param {string} columnsJson - JSON array string ('["a","b"]') + */ +const getClusterFieldMetadataBatch = (dbName, tableName, columnsJson) => ` +import json +_db = ${JSON.stringify(dbName)} +_table = ${JSON.stringify(tableName)} +_cols = json.loads(${JSON.stringify(columnsJson)}) + +def truncate_metadata(obj, max_len=${FIELD_METADATA_STRING_MAX}): + if isinstance(obj, dict): + return {k: truncate_metadata(v, max_len) for k, v in obj.items()} + if isinstance(obj, list): + return [truncate_metadata(i, max_len) for i in obj] + if isinstance(obj, str) and len(obj) > max_len: + return obj[:max_len] + "…" + return obj + +def metadata_to_dict(m): + try: + if m is None: + return {} + return truncate_metadata(json.loads(m.json())) + except Exception: + return {} + +try: + _want = set(_cols) + fields = [f for f in spark.table(_db + "." + _table).schema.fields if f.name in _want] + out = { + "name": _table, + "nullableMap": {f.name: f.nullable for f in fields}, + "indexes": {f.name: metadata_to_dict(f.metadata) for f in fields}, + } +except Exception: + out = {"name": _table, "nullableMap": {}, "indexes": {}} + +dbutils.notebook.exit(json.dumps(out)) +`; + const getViewNamesCommand = databaseName => ` import json @@ -37,7 +112,22 @@ viewNames = spark.sql("show views in ${databaseName}").rdd.map(lambda p: p.viewN dbutils.notebook.exit(json.dumps(viewNames)) `; +/** + * Compact column name + Spark type list for building a minimal CREATE TABLE when SHOW CREATE TABLE output is too large. + * Use a 3-part name (catalog.schema.table) when Unity Catalog applies — the Python command context does not share SQL session catalog state. + * @param {string} fqn - e.g. "hive_metastore.default.my_table" or "default.my_table" + */ +const getTableSchemaColumnsForDdlFallback = fqn => ` +import json +_fqn = ${JSON.stringify(fqn)} +_cols = [{"name": f.name, "colType": f.dataType.simpleString()} for f in spark.table(_fqn).schema.fields] +dbutils.notebook.exit(json.dumps(_cols)) +`; + module.exports = { getClusterData, + getClusterColumnNames, + getClusterFieldMetadataBatch, + getTableSchemaColumnsForDdlFallback, getViewNamesCommand, }; diff --git a/shared/constants.js b/shared/constants.js index 957b701..8c669e2 100644 --- a/shared/constants.js +++ b/shared/constants.js @@ -17,9 +17,15 @@ const FIELD_METADATA_COLUMN_BATCH_SIZE = 12; /** Truncate long strings in Spark field metadata when serializing indexes (per batch). */ const FIELD_METADATA_STRING_MAX = 4000; +const SPARK_LANGUAGE = { + python: 'python', + sql: 'sql', +}; + module.exports = { COMMAND_EXECUTION_STATUS, REQUEST_TIMEOUT_MESSAGE, FIELD_METADATA_COLUMN_BATCH_SIZE, FIELD_METADATA_STRING_MAX, + SPARK_LANGUAGE, };