-
Notifications
You must be signed in to change notification settings - Fork 28
FIX: Encoding Decoding #342
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
… jahnvi/final_linting
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Copilot encountered an error and was unable to review this pull request. You can try again by re-requesting a review.
gargsaumya
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Left a few comments to consider.
📊 Code Coverage Report
Diff CoverageDiff: main...HEAD, staged and unstaged changes
Summary
mssql_python/pybind/ddbc_bindings.cppLines 323-365 323 *strLenOrIndPtr = SQL_LEN_DATA_AT_EXEC(0);
324 bufferLength = 0;
325 } else {
326 // Use Python's codec system to encode the string with specified encoding
! 327 std::string encodedStr;
328
! 329 if (py::isinstance<py::str>(param)) {
330 // Encode Unicode string using the specified encoding
! 331 try {
! 332 py::object encoded = param.attr("encode")(charEncoding, "strict");
! 333 encodedStr = encoded.cast<std::string>();
! 334 LOG("BindParameters: param[%d] SQL_C_CHAR - Encoded with '%s', "
! 335 "size=%zu bytes",
! 336 paramIndex, charEncoding.c_str(), encodedStr.size());
! 337 } catch (const py::error_already_set& e) {
! 338 LOG_ERROR("BindParameters: param[%d] SQL_C_CHAR - Failed to encode "
! 339 "with '%s': %s",
! 340 paramIndex, charEncoding.c_str(), e.what());
! 341 throw std::runtime_error(std::string("Failed to encode parameter ") +
! 342 std::to_string(paramIndex) +
! 343 " with encoding '" + charEncoding +
! 344 "': " + e.what());
! 345 }
! 346 } else {
347 // bytes/bytearray - use as-is (already encoded)
! 348 if (py::isinstance<py::bytes>(param)) {
! 349 encodedStr = param.cast<std::string>();
! 350 } else {
351 // bytearray
! 352 encodedStr = std::string(
! 353 reinterpret_cast<const char*>(PyByteArray_AsString(param.ptr())),
! 354 PyByteArray_Size(param.ptr()));
! 355 }
! 356 LOG("BindParameters: param[%d] SQL_C_CHAR - Using raw bytes, size=%zu",
! 357 paramIndex, encodedStr.size());
! 358 }
359
360 std::string* strParam =
! 361 AllocateParamBuffer<std::string>(paramBuffers, encodedStr);
362 dataPtr = const_cast<void*>(static_cast<const void*>(strParam->c_str()));
363 bufferLength = strParam->size() + 1;
364 strLenOrIndPtr = AllocateParamBuffer<SQLLEN>(paramBuffers);
365 *strLenOrIndPtr = SQL_NTS;Lines 1741-1766 1741 offset += len;
1742 }
1743 } else if (matchedInfo->paramCType == SQL_C_CHAR) {
1744 // Encode the string using the specified encoding
! 1745 std::string encodedStr;
! 1746 try {
! 1747 if (py::isinstance<py::str>(pyObj)) {
! 1748 py::object encoded = pyObj.attr("encode")(charEncoding, "strict");
! 1749 encodedStr = encoded.cast<std::string>();
! 1750 LOG("SQLExecute: DAE SQL_C_CHAR - Encoded with '%s', %zu bytes",
! 1751 charEncoding.c_str(), encodedStr.size());
! 1752 } else {
! 1753 encodedStr = pyObj.cast<std::string>();
! 1754 }
! 1755 } catch (const py::error_already_set& e) {
! 1756 LOG_ERROR("SQLExecute: DAE SQL_C_CHAR - Failed to encode with '%s': %s",
! 1757 charEncoding.c_str(), e.what());
! 1758 throw;
! 1759 }
1760
! 1761 size_t totalBytes = encodedStr.size();
! 1762 const char* dataPtr = encodedStr.data();
1763 size_t offset = 0;
1764 size_t chunkBytes = DAE_CHUNK_SIZE;
1765 while (offset < totalBytes) {
1766 size_t len = std::min(chunkBytes, totalBytes - offset);Lines 2043-2068 2043
2044 if (py::isinstance<py::str>(columnValues[i])) {
2045 // Use Python's codec system to encode the string with specified
2046 // encoding
! 2047 try {
! 2048 py::object encoded =
! 2049 columnValues[i].attr("encode")(charEncoding, "strict");
! 2050 encodedStr = encoded.cast<std::string>();
! 2051 LOG("BindParameterArray: param[%d] row[%zu] SQL_C_CHAR - "
! 2052 "Encoded with '%s', "
! 2053 "size=%zu bytes",
! 2054 paramIndex, i, charEncoding.c_str(), encodedStr.size());
! 2055 } catch (const py::error_already_set& e) {
! 2056 LOG_ERROR("BindParameterArray: param[%d] row[%zu] SQL_C_CHAR - "
! 2057 "Failed to encode "
! 2058 "with '%s': %s",
! 2059 paramIndex, i, charEncoding.c_str(), e.what());
! 2060 throw std::runtime_error(
! 2061 std::string("Failed to encode parameter ") +
! 2062 std::to_string(paramIndex) + " row " + std::to_string(i) +
! 2063 " with encoding '" + charEncoding + "': " + e.what());
! 2064 }
2065 } else {
2066 // bytes/bytearray - use as-is (already encoded)
2067 encodedStr = columnValues[i].cast<std::string>();
2068 }Lines 2070-2078 2070 if (encodedStr.size() > info.columnSize) {
2071 LOG("BindParameterArray: String/binary too "
2072 "long - param_index=%d, row=%zu, size=%zu, "
2073 "max=%zu",
! 2074 paramIndex, i, encodedStr.size(), info.columnSize);
2075 ThrowStdException("Input exceeds column size at index " +
2076 std::to_string(i));
2077 }
2078 std::memcpy(charArray + i * (info.columnSize + 1), encodedStr.c_str(),Lines 2542-2550 2542 py::list rowParams = columnwise_params[rowIndex];
2543
2544 std::vector<std::shared_ptr<void>> paramBuffers;
2545 rc = BindParameters(hStmt, rowParams, const_cast<std::vector<ParamInfo>&>(paramInfos),
! 2546 paramBuffers, charEncoding);
2547 if (!SQL_SUCCEEDED(rc)) {
2548 LOG("SQLExecuteMany: BindParameters failed for row %zu - rc=%d", rowIndex, rc);
2549 return rc;
2550 }Lines 2845-2857 2845 "column %d",
2846 charEncoding.c_str(), buffer.size(), py::len(decoded), colIndex);
2847 return decoded;
2848 } catch (const py::error_already_set& e) {
! 2849 LOG_ERROR("FetchLobColumnData: Failed to decode with '%s' for column %d: %s",
! 2850 charEncoding.c_str(), colIndex, e.what());
2851 // Return raw bytes as fallback
! 2852 return raw_bytes;
! 2853 }
2854 }
2855
2856 // Helper function to retrieve column data
2857 SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, py::list& row,Lines 2926-2939 2926 LOG("SQLGetData: CHAR column %d decoded with '%s', %zu bytes "
2927 "-> %zu chars",
2928 i, charEncoding.c_str(), (size_t)dataLen, py::len(decoded));
2929 } catch (const py::error_already_set& e) {
! 2930 LOG_ERROR(
! 2931 "SQLGetData: Failed to decode CHAR column %d with '%s': %s",
! 2932 i, charEncoding.c_str(), e.what());
2933 // Return raw bytes as fallback
! 2934 row.append(raw_bytes);
! 2935 }
2936 } else {
2937 // Buffer too small, fallback to streaming
2938 LOG("SQLGetData: CHAR column %d data truncated "
2939 "(buffer_size=%zu), using streaming LOB",Lines 2937-2946 2937 // Buffer too small, fallback to streaming
2938 LOG("SQLGetData: CHAR column %d data truncated "
2939 "(buffer_size=%zu), using streaming LOB",
2940 i, dataBuffer.size());
! 2941 row.append(FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false,
! 2942 charEncoding));
2943 }
2944 } else if (dataLen == SQL_NULL_DATA) {
2945 LOG("SQLGetData: Column %d is NULL (CHAR)", i);
2946 row.append(py::none());Lines 3271-3280 3271 if (static_cast<size_t>(dataLen) <= columnSize) {
3272 row.append(py::bytes(
3273 reinterpret_cast<const char*>(dataBuffer.data()), dataLen));
3274 } else {
! 3275 row.append(
! 3276 FetchLobColumnData(hStmt, i, SQL_C_BINARY, false, true, ""));
3277 }
3278 } else if (dataLen == SQL_NULL_DATA) {
3279 row.append(py::none());
3280 } else if (dataLen == 0) {mssql_python/pybind/ddbc_bindings.h📋 Files Needing Attention📉 Files with overall lowest coverage (click to expand)mssql_python.pybind.logger_bridge.hpp: 58.8%
mssql_python.pybind.logger_bridge.cpp: 59.2%
mssql_python.pybind.ddbc_bindings.cpp: 66.2%
mssql_python.row.py: 66.2%
mssql_python.helpers.py: 67.5%
mssql_python.pybind.connection.connection.cpp: 73.6%
mssql_python.pybind.ddbc_bindings.h: 76.6%
mssql_python.ddbc_bindings.py: 79.6%
mssql_python.pybind.connection.connection_pool.cpp: 79.6%
mssql_python.connection.py: 83.7%🔗 Quick Links
|
Work Item / Issue Reference
Summary
This pull request introduces significant improvements to encoding and decoding handling in the MSSQL Python driver, focusing on thread safety, security, and robustness. The main changes include stricter validation and enforcement of encoding rules for SQL_WCHAR types, making encoding/decoding settings thread-safe, and updating cursor methods to consistently use these settings. This ensures correct handling of Unicode data, prevents ambiguous encoding scenarios, and improves reliability in multi-threaded environments.
Encoding and Decoding Validation & Enforcement
'utf-16le'and'utf-16be'encodings are accepted for SQL_WCHAR, explicitly rejecting'utf-16'with BOM due to byte order ambiguity. Programming errors are raised if invalid encodings are used, both insetencodingandsetdecodingmethods.Thread Safety
_encoding_lock) to protect encoding and decoding settings, makingsetencoding,setdecoding,getencoding, andgetdecodingthread-safe and preventing race conditions.Cursor Integration
execute,executemany,fetchone,fetchmany,fetchall) to retrieve encoding and decoding settings from the connection and pass them to low-level bindings, ensuring consistent Unicode handling throughout query execution and result fetchingError Handling and Logging
Bindings Interface Update