-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSQLQuery4.sql
More file actions
63 lines (57 loc) · 4.31 KB
/
SQLQuery4.sql
File metadata and controls
63 lines (57 loc) · 4.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
-- Common Table Expression (CTE) to identify and tag duplicate records
--USE PortfolioProject_MarketingAnalytics;
--SELECT * FROM dbo.customer_journey;
WITH DuplicateRecords AS (
SELECT
JourneyID, -- Select the unique identifier for each journey (and any other columns you want to include in the final result set)
CustomerID, -- Select the unique identifier for each customer
ProductID, -- Select the unique identifier for each product
VisitDate, -- Select the date of the visit, which helps in determining the timeline of customer interactions
Stage, -- Select the stage of the customer journey (e.g., Awareness, Consideration, etc.)
Action, -- Select the action taken by the customer (e.g., View, Click, Purchase)
Duration, -- Select the duration of the action or interaction
-- Use ROW_NUMBER() to assign a unique row number to each record within the partition defined below
ROW_NUMBER() OVER ( --The OVER() clause specifies the "window" or set of rows on which the ROW_NUMBER() function will operate.
-- PARTITION BY groups the rows based on the specified columns that should be unique
PARTITION BY CustomerID, ProductID, VisitDate, Stage, Action
-- ORDER BY defines how to order the rows within each partition (usually by a unique identifier like JourneyID)
ORDER BY JourneyID
) AS row_num -- This creates a new column 'row_num' that numbers each row within its partition
FROM
dbo.customer_journey -- Specifies the source table from which to select the data
)
-- Select all records from the CTE where row_num > 1, which indicates duplicate entries
SELECT *
FROM DuplicateRecords
WHERE row_num > 1 -- Filters out the first occurrence (row_num = 1) and only shows the duplicates (row_num > 1)
ORDER BY JourneyID
-- Outer query selects the final cleaned and standardized data
SELECT
JourneyID, -- Selects the unique identifier for each journey to ensure data traceability
CustomerID, -- Selects the unique identifier for each customer to link journeys to specific customers
ProductID, -- Selects the unique identifier for each product to analyze customer interactions with different products
VisitDate, -- Selects the date of the visit to understand the timeline of customer interactions
Stage, -- Uses the uppercased stage value from the subquery for consistency in analysis
Action, -- Selects the action taken by the customer (e.g., View, Click, Purchase)
COALESCE(Duration, avg_duration) AS Duration -- Replaces missing durations with the average duration for the corresponding date
FROM
(
-- Subquery to process and clean the data
SELECT
JourneyID, -- Selects the unique identifier for each journey to ensure data traceability
CustomerID, -- Selects the unique identifier for each customer to link journeys to specific customers
ProductID, -- Selects the unique identifier for each product to analyze customer interactions with different products
VisitDate, -- Selects the date of the visit to understand the timeline of customer interactions
UPPER(Stage) AS Stage, -- Converts Stage values to uppercase for consistency in data analysis
Action, -- Selects the action taken by the customer (e.g., View, Click, Purchase)
Duration, -- Uses Duration directly, assuming it's already a numeric type
AVG(Duration) OVER (PARTITION BY VisitDate) AS avg_duration, -- Calculates the average duration for each date, using only numeric values
ROW_NUMBER() OVER (
PARTITION BY CustomerID, ProductID, VisitDate, UPPER(Stage), Action -- Groups by these columns to identify duplicate records
ORDER BY JourneyID -- Orders by JourneyID to keep the first occurrence of each duplicate
) AS row_num -- Assigns a row number to each row within the partition to identify duplicates
FROM
dbo.customer_journey -- Specifies the source table from which to select the data
) AS subquery -- Names the subquery for reference in the outer query
WHERE
row_num = 1; -- Keeps only the first occurrence of each duplicate group identified in the subquery