-
Notifications
You must be signed in to change notification settings - Fork 127
Expand file tree
/
Copy pathgeneralizator.py
More file actions
130 lines (98 loc) · 3.91 KB
/
generalizator.py
File metadata and controls
130 lines (98 loc) · 3.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""Produce a generalised (anonymised) version of a SQL query.
Replaces string literals with ``X``, numbers with ``N``, and
multi-value ``IN (...)`` / ``VALUES (...)`` lists with ``(XYZ)`` so
that structurally identical queries can be grouped for analysis
(e.g. slow-query log aggregation). Based on MediaWiki's
``DatabaseBase::generalizeSQL``.
"""
import re
from sql_metadata.comments import strip_comments
class Generalizator:
"""Produce a generalised form of a SQL query.
Strips comments, removes string literals and numeric values, and
collapses repeated ``LIKE`` / ``IN`` / ``VALUES`` clauses. Designed
for grouping structurally identical queries in monitoring and logging
pipelines.
Used by :attr:`Parser.generalize`, which delegates to
:attr:`Generalizator.generalize`.
:param sql: Raw SQL query string to generalise.
:type sql: str
"""
def __init__(self, sql: str = ""):
"""Initialise with the raw SQL string.
:param sql: SQL query to generalise.
:type sql: str
"""
self._raw_query = sql
# SQL queries normalization (#16)
@staticmethod
def _normalize_likes(sql: str) -> str:
"""Normalise and collapse repeated ``LIKE`` clauses.
Strips ``%`` wildcards, replaces ``LIKE '...'`` with ``LIKE X``,
and collapses consecutive ``or/and ... LIKE X`` clauses into a
single instance with ``...`` suffix.
:param sql: SQL string with LIKE clauses.
:type sql: str
:returns: SQL with LIKE clauses normalised.
:rtype: str
"""
sql = sql.replace("%", "")
# LIKE '%bot'
sql = re.sub(r"LIKE '[^\']+'", "LIKE X", sql)
# or all_groups LIKE X or all_groups LIKE X
found = re.finditer(r"(or|and) [^\s]+ LIKE X", sql, flags=re.IGNORECASE)
like_matches = [m.group(0) for m in found]
if like_matches:
for match in set(like_matches):
sql = re.sub(
r"(\s?" + re.escape(match) + ")+", " " + match + " ...", sql
)
return sql
@property
def without_comments(self) -> str:
"""Return the SQL with all comments removed.
Delegates to :func:`strip_comments` from ``_comments.py``.
:returns: Comment-free SQL string.
:rtype: str
"""
return strip_comments(self._raw_query)
@property
def generalize(self) -> str:
"""Return a generalised version of the SQL query.
Applies the following transformations in order:
1. Strip comments.
2. Remove double-quotes.
3. Collapse multiple spaces.
4. Normalise ``LIKE`` clauses.
5. Replace escaped characters.
6. Replace string literals with ``X``.
7. Collapse whitespace to single spaces.
8. Replace numbers with ``N``.
9. Collapse ``IN (...)`` / ``VALUES (...)`` lists to ``(XYZ)``.
:returns: Generalised SQL string, or ``""`` for empty input.
:rtype: str
"""
if self._raw_query == "":
return ""
# MW comments
# e.g. /* CategoryDataService::getMostVisited N.N.N.N */
sql = self.without_comments
sql = sql.replace('"', "")
# multiple spaces
sql = re.sub(r"\s{2,}", " ", sql)
# handle LIKE statements
sql = self._normalize_likes(sql)
sql = re.sub(r"\\\\", "", sql)
sql = re.sub(r"\\'", "", sql)
sql = re.sub(r'\\"', "", sql)
sql = re.sub(r"'[^\']*'", "X", sql)
sql = re.sub(r'"[^\"]*"', "X", sql)
# All newlines, tabs, etc replaced by single space
sql = re.sub(r"\s+", " ", sql)
# All numbers => N
sql = re.sub(r"-?[0-9]+", "N", sql)
# WHERE foo IN ('880987','882618','708228','522330')
sql = re.sub(
r" (IN|VALUES)\s*\([^,]+,[^)]+\)", " \\1 (XYZ)", sql, flags=re.IGNORECASE
)
return sql.strip()