-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_cleaning_by_PostgreSQL.sql
More file actions
356 lines (302 loc) · 8.55 KB
/
data_cleaning_by_PostgreSQL.sql
File metadata and controls
356 lines (302 loc) · 8.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
-- =====================================================
-- LAYOFFS DATA CLEANING PROJECT
-- Author: Muhammad Akbar Khan
-- Role: M.S in Applied Mathematics | Data Analyst
-- Database: PostgreSQL
-- Data Source: Layoffs dataset from Kaggle
-- Purpose: Comprehensive data cleaning and standardization pipeline
-- Created: 2025-10-08
-- Last Modified: 2025-10-08
-- =====================================================
-- Create the initial raw data table
-- All columns stored as TEXT initially for flexible data ingestion
CREATE TABLE layoffs (
company TEXT,
location TEXT,
industry TEXT,
total_laid_off NUMERIC,
percentage_laid_off NUMERIC,
date TEXT,
stage TEXT,
country TEXT,
funds_raised_millions NUMERIC
);
-- Preview raw dataset contents
SELECT *
FROM layoffs;
-- Detect duplicate records across all key attributes for initial quality profiling
SELECT DISTINCT *
FROM layoffs;
-- Use grouping to detect exact duplicates across all dataset fields
SELECT *,
COUNT(*)
FROM layoffs
GROUP BY company,
location,
industry,
total_laid_off,
percentage_laid_off,
date,
stage,
country,
funds_raised_millions
HAVING COUNT(*) > 1
ORDER BY company;
-- Examine all companies identified with potential duplicate records
SELECT *
FROM layoffs
WHERE company IN (
'Casper',
'Cazoo',
'Hibob',
'Wildlife Studios',
'Yahoo'
)
ORDER BY company;
-- =====================================================
-- DATA DEDUPLICATION
-- Create a clean staging table by removing exact duplicates
-- =====================================================
CREATE TABLE layoffs_staging AS
SELECT DISTINCT *
FROM layoffs;
-- Validate creation of the staging table
SELECT *
FROM layoffs_staging;
-- Confirm that duplicate records have been removed
SELECT DISTINCT *
FROM layoffs_staging;
-- =====================================================
-- COMPANY COLUMN CLEANING
-- =====================================================
SELECT company
FROM layoffs_staging
WHERE company IS NULL
OR company = ''
OR company ILIKE 'NULL';
SELECT DISTINCT company
FROM layoffs_staging;
SELECT company
FROM layoffs_staging
WHERE company != TRIM(company);
UPDATE layoffs_staging
SET company = TRIM(company);
SELECT LOWER(company),
COUNT(DISTINCT company),
ARRAY_AGG(DISTINCT company)
FROM layoffs_staging
GROUP BY LOWER(company)
HAVING COUNT(DISTINCT company) > 1;
SELECT *
FROM layoffs_staging
WHERE company ILIKE 'appgate'
OR company ILIKE 'bytedance'
OR company ILIKE 'clearco'
OR company ILIKE 'curefit'
OR company ILIKE 'salesloft'
ORDER BY company;
UPDATE layoffs_staging
SET company = CASE
WHEN LOWER(company) = 'appgate' THEN 'Appgate'
WHEN LOWER(company) = 'bytedance' THEN 'ByteDance'
WHEN LOWER(company) = 'clearco' THEN 'Clearco'
WHEN LOWER(company) = 'curefit' THEN 'Curefit'
WHEN LOWER(company) = 'salesloft' THEN 'Salesloft'
ELSE company
END;
SELECT *
FROM layoffs_staging AS t1
INNER JOIN layoffs_staging AS t2 ON t1.company < t2.company
WHERE SIMILARITY(t1.company, t2.company) > 0.6
ORDER BY t1.company,
t2.company;
UPDATE layoffs_staging
SET company = 'Impossible Foods'
WHERE company = 'Impossible Foods copy';
-- =====================================================
-- LOCATION COLUMN CLEANING
-- =====================================================
SELECT location
FROM layoffs_staging
WHERE location IS NULL
OR location = ''
OR location ILIKE 'NULL';
SELECT DISTINCT location
FROM layoffs_staging
ORDER BY location;
SELECT location
FROM layoffs_staging
WHERE location != TRIM(location);
SELECT LOWER(location),
COUNT(DISTINCT location)
FROM layoffs_staging
GROUP BY LOWER(location)
HAVING COUNT(DISTINCT location) > 1;
SELECT t1.location AS location_1,
t2.location AS location_2
FROM layoffs_staging AS t1
INNER JOIN layoffs_staging AS t2 ON t1.location < t2.location
WHERE SIMILARITY(t1.location, t2.location) > 0.4;
SELECT *
FROM layoffs_staging
WHERE location ILIKE '%sseldorf'
OR location ILIKE 'Malm%'
ORDER BY location;
UPDATE layoffs_staging
SET location = CASE
WHEN location = 'Dusseldorf' THEN 'Düsseldorf'
WHEN location = 'Malmo' THEN 'Malmö'
ELSE location
END;
-- =====================================================
-- INDUSTRY COLUMN CLEANING
-- =====================================================
SELECT industry
FROM layoffs_staging
WHERE industry IS NULL
OR industry = ''
OR industry ILIKE 'NULL';
UPDATE layoffs_staging
SET industry = NULL
WHERE industry = ''
OR industry ILIKE 'NULL';
SELECT t1.company,
t1.industry AS industry_1,
t2.industry AS industry_2
FROM layoffs_staging AS t1
INNER JOIN layoffs_staging AS t2 ON t1.company = t2.company
WHERE t1.industry IS NULL
AND t2.industry IS NOT NULL
ORDER BY t1.company;
SELECT *
FROM layoffs_staging
WHERE company IN ('Airbnb', 'Carvana', 'Juul')
ORDER BY company;
UPDATE layoffs_staging AS t1
SET industry = t2.industry
FROM layoffs_staging AS t2
WHERE t1.company = t2.company
AND t1.industry IS NULL
AND t2.industry IS NOT NULL;
-- =====================================================
-- DATE COLUMN CLEANING AND CONVERSION
-- =====================================================
SELECT date
FROM layoffs_staging
WHERE date IS NULL
OR date = ''
OR date ILIKE 'NULL';
SELECT date
FROM layoffs_staging;
SELECT date,
TO_DATE(date, 'MM/DD/YYYY')
FROM layoffs_staging;
ALTER TABLE layoffs_staging
ALTER COLUMN date TYPE DATE USING TO_DATE(date, 'MM/DD/YYYY');
SELECT date
FROM layoffs_staging
WHERE date IS NULL;
DELETE FROM layoffs_staging
WHERE date IS NULL;
-- =====================================================
-- STAGE COLUMN CLEANING
-- =====================================================
SELECT stage
FROM layoffs_staging
WHERE stage IS NULL
OR stage = ''
OR stage ILIKE 'NULL';
SELECT DISTINCT stage
FROM layoffs_staging
ORDER BY stage;
-- =====================================================
-- COUNTRY COLUMN CLEANING
-- =====================================================
SELECT country
FROM layoffs_staging
WHERE country IS NULL
OR country = ''
OR country ILIKE 'NULL';
SELECT DISTINCT country
FROM layoffs_staging
ORDER BY country;
SELECT *
FROM layoffs_staging
WHERE country ILIKE 'United States%'
ORDER BY country DESC;
UPDATE layoffs_staging
SET country = 'United States'
WHERE country = 'United States.';
-- =====================================================
-- FUNDS_RAISED_MILLIONS COLUMN
-- =====================================================
SELECT funds_raised_millions
FROM layoffs_staging
WHERE funds_raised_millions IS NULL;
-- =====================================================
-- FINAL DATA QUALITY CHECKS
-- =====================================================
SELECT *
FROM layoffs_staging
WHERE total_laid_off IS NULL
AND percentage_laid_off IS NULL;
DELETE FROM layoffs_staging
WHERE total_laid_off IS NULL
AND percentage_laid_off IS NULL;
-- =====================================================
-- FINAL VALIDATION AND DATA QUALITY ASSESSMENT
-- =====================================================
SELECT COUNT(*) AS final_clean_records
FROM layoffs_staging;
SELECT COUNT(*)
FROM layoffs_staging
WHERE company IS NULL
OR date IS NULL;
SELECT COUNT(*) AS total_records,
COUNT(DISTINCT company) AS unique_companies,
COUNT(DISTINCT industry) AS unique_industries,
COUNT(DISTINCT country) AS unique_countries,
COUNT(DISTINCT location) AS unique_locations,
SUM(
CASE
WHEN total_laid_off IS NOT NULL THEN 1
ELSE 0
END
) AS records_with_layoff_counts,
SUM(
CASE
WHEN percentage_laid_off IS NOT NULL THEN 1
ELSE 0
END
) AS records_with_percentages,
MIN(date) AS earliest_layoff_date,
MAX(date) AS latest_layoff_date
FROM layoffs_staging;
-- =====================================================
-- FINAL CLEANED TABLE CREATION
-- Author: Muhammad Akbar Khan
-- Purpose: Generate the final cleaned and standardized dataset
-- =====================================================
DROP TABLE IF EXISTS layoffs_cleaned;
CREATE TABLE layoffs_cleaned AS
SELECT
company,
location,
industry,
total_laid_off,
percentage_laid_off,
date,
stage,
country,
funds_raised_millions
FROM layoffs_staging;
SELECT COUNT(*) AS total_clean_records
FROM layoffs_cleaned;
SELECT COUNT(*) AS null_critical_fields
FROM layoffs_cleaned
WHERE company IS NULL
OR date IS NULL
OR country IS NULL;
SELECT *
FROM layoffs_cleaned
LIMIT 20;