我认为使用日历表可以很简单,例如:
-- Per customer, count the distinct calendarDate values that fall inside any of
-- that customer's StartDate..EndDate ranges (both ends inclusive); a date
-- covered by several ranges is counted once.
SELECT
    itm.CustID,
    COUNT( DISTINCT cal.calendarDate ) AS days
FROM #Items AS itm
INNER JOIN calendar.main AS cal
    ON  cal.calendarDate >= itm.StartDate
    AND cal.calendarDate <= itm.EndDate
GROUP BY itm.CustID
试验台
USE tempdb
GO
-- Cut-down calendar table, created only when dbo.calendar does not exist yet.
-- calendarDate is the primary key (one row per date); calendarId is an
-- identity column that is kept unique by its own nonclustered constraint.
IF OBJECT_ID('dbo.calendar') IS NULL
BEGIN
    CREATE TABLE dbo.calendar
    (
        calendarId   INT IDENTITY(1,1) NOT NULL,
        calendarDate DATE NOT NULL,

        CONSTRAINT PK_calendar__main
            PRIMARY KEY ( calendarDate ASC )
            WITH (
                PAD_INDEX = OFF,
                STATISTICS_NORECOMPUTE = OFF,
                IGNORE_DUP_KEY = OFF,
                ALLOW_ROW_LOCKS = ON,
                ALLOW_PAGE_LOCKS = ON
            )
            ON [PRIMARY],

        CONSTRAINT UK_calendar__main
            UNIQUE NONCLUSTERED ( calendarId ASC )
            WITH (
                PAD_INDEX = OFF,
                STATISTICS_NORECOMPUTE = OFF,
                IGNORE_DUP_KEY = OFF,
                ALLOW_ROW_LOCKS = ON,
                ALLOW_PAGE_LOCKS = ON
            )
            ON [PRIMARY]
    )
    ON [PRIMARY]
END
GO
-- Populate calendar table once only (skipped when dbo.calendar already has rows).
IF NOT EXISTS ( SELECT * FROM dbo.calendar )
BEGIN
    -- Inclusive date range to generate; extend @lastDate if required.
    -- 'YYYYMMDD' literals are used because they are read the same way under
    -- every session language / DATEFORMAT setting.
    DECLARE @firstDate DATE = '20100101',
            @lastDate  DATE = '20301231';

    -- Recursive counter 0 .. DATEDIFF(day, @firstDate, @lastDate): exactly one
    -- value per day in the range, so no surplus rows have to be filtered out.
    WITH cte AS
    (
        SELECT 0 AS x
        UNION ALL
        SELECT x + 1
        FROM cte
        WHERE x < DATEDIFF( day, @firstDate, @lastDate )
    )
    INSERT INTO dbo.calendar ( calendarDate )
    SELECT DATEADD( day, x, @firstDate )
    FROM cte
    -- The range needs far more than the default 100 recursion levels;
    -- 0 removes the limit.
    OPTION ( MAXRECURSION 0 );

    -- Rebuild every index on the table after the bulk load.
    ALTER INDEX ALL ON dbo.calendar REBUILD;
END
GO
-- Recreate the test table from scratch on every run.
-- The existence check, the DROP and the CREATE all name dbo.Items explicitly, so
-- they always refer to the same object in the current database; the 'U' type
-- filter restricts the check to user tables.
IF OBJECT_ID('dbo.Items', 'U') IS NOT NULL
    DROP TABLE dbo.Items
GO
-- One row per item: the customer it belongs to and the StartDate..EndDate range
-- it covers. Clustered on ( CustID, StartDate, EndDate ); the index is not
-- unique, so duplicate ranges are allowed.
CREATE TABLE dbo.Items
(
    CustID    INT  NOT NULL,
    ItemID    INT  NOT NULL,
    StartDate DATE NOT NULL,
    EndDate   DATE NOT NULL,
    INDEX _cdx_Items CLUSTERED ( CustID, StartDate, EndDate )
)
GO
-- Hand-made sample rows for customer 11205: overlapping ranges, a one-day range
-- nested inside another, one exact duplicate row, and a range that starts the
-- day after another one ends.
INSERT INTO Items ( CustID, ItemID, StartDate, EndDate )
VALUES
    ( 11205, 20009, '2015-01-23', '2015-01-26' ),
    ( 11205, 20010, '2015-01-24', '2015-01-24' ),
    ( 11205, 20011, '2015-01-23', '2015-01-26' ),
    ( 11205, 20012, '2015-01-23', '2015-01-27' ),
    ( 11205, 20012, '2015-01-23', '2015-01-27' ),
    ( 11205, 20012, '2015-01-28', '2015-01-29' )
GO
-- Scale up : )
-- Appends up to 1,000,000 generated rows. rn is a running row number taken from
-- a three-way cross join of master.sys.columns (used only as a row source).
-- Per row: CustID cycles through 999 values starting at 11206, ItemID is
-- 20012 + rn, StartDate is 0..332 days after 1 Jan 2015 and EndDate is
-- 0..6 days after StartDate.
WITH numbers AS
(
    SELECT TOP 1000000
        ROW_NUMBER() OVER ( ORDER BY ( SELECT 1 ) ) AS rn
    FROM master.sys.columns AS a
    CROSS JOIN master.sys.columns AS b
    CROSS JOIN master.sys.columns AS c
)
INSERT INTO Items ( CustID, ItemID, StartDate, EndDate )
SELECT
    11206 + n.rn % 999,
    20012 + n.rn,
    DATEADD( day, n.rn % 333, '1 Jan 2015' ),
    DATEADD( day, ( n.rn % 333 ) + n.rn % 7, '1 Jan 2015' )
FROM numbers AS n
GO
--:exit
-- My query: Pros: simple, one copy of items, easy to understand and maintain. Scales well to 1 million + rows.
-- Cons: requires calendar table. Others?
-- Joins every item to the calendar dates inside its range (inclusive) and
-- counts each date once per customer.
SELECT
    itm.CustID,
    COUNT( DISTINCT cal.calendarDate ) AS days
FROM dbo.Items AS itm
INNER JOIN dbo.calendar AS cal
    ON cal.calendarDate BETWEEN itm.StartDate AND itm.EndDate
GROUP BY itm.CustID
-- Uncomment for a deterministic row order:
--ORDER BY itm.CustID
GO
-- Vladimir query: Pros: Effectively same as above
-- Cons: I wouldn't use CROSS APPLY where it's not necessary. Fortunately optimizer simplifies avoiding RBAR (I think).
-- Point of style maybe, but in terms of queries being self-documenting I prefer number 1.
-- For every item, the applied subquery returns the calendar dates inside the
-- item's range (inclusive); each date is then counted once per customer.
SELECT
    itm.CustID,
    COUNT( DISTINCT covered.calendarDate ) AS TotalCount
FROM Items AS itm
CROSS APPLY
(
    SELECT cal.calendarDate
    FROM dbo.calendar AS cal
    WHERE cal.calendarDate BETWEEN itm.StartDate AND itm.EndDate
) AS covered
GROUP BY itm.CustID
-- Optional while debugging: add "WHERE itm.CustID = 11205" before the GROUP BY
-- and/or "ORDER BY itm.CustID" after it.
GO
/* WARNING!! This is commented out as it can't compete in the scale test. It will finish at scale 100, 1,000 and 10,000, eventually; at 100,000 rows it ran for 38 minutes and pegged the CPU.
-- Julian: Pros; does not require calendar table.
-- Cons: over-complicated (eg versus Query 1 in terms of number of lines of code, clauses etc); three copies of dbo.Items table (we have already shown
-- this query is possible with one); does not scale (even at 100,000 rows query ran for 38 minutes on my test rig versus sub-second for first two queries). <<-- this is serious.
-- Indexing could help.
SELECT DISTINCT
CustID,
StartDate = CASE WHEN itmin.StartDate < its.StartDate THEN itmin.StartDate ELSE its.StartDate END
, EndDate = CASE WHEN itmax.EndDate > its.EndDate THEN itmax.EndDate ELSE its.EndDate END
FROM Items its
OUTER APPLY (
SELECT StartDate = MIN(StartDate) FROM Items std
WHERE std.ItemID <> its.ItemID AND (
(std.StartDate <= its.StartDate AND std.EndDate >= its.StartDate)
OR (std.StartDate >= its.StartDate AND std.StartDate <= its.EndDate)
)
) itmin
OUTER APPLY (
SELECT EndDate = MAX(EndDate) FROM Items std
WHERE std.ItemID <> its.ItemID AND (
(std.EndDate >= its.StartDate AND std.EndDate <= its.EndDate)
OR (std.StartDate <= its.EndDate AND std.EndDate >= its.EndDate)
)
) itmax
GO
*/
-- ypercube: Pros; does not require calendar table.
-- Cons: over-complicated (eg versus Query 1 in terms of number of lines of code, clauses etc); four copies of dbo.Items table (we have already shown
-- this query is possible with one); does not scale well; at 1,000,000 rows query ran for 2:20 minutes on my test rig versus sub-second for first two queries.
--
-- How it works:
--   start_dates : distinct StartDates of a customer for which no row of the same
--                 customer starts earlier and still covers that date, numbered
--                 per customer in date order.
--   end_dates   : distinct EndDates of a customer for which no row of the same
--                 customer covers that date and ends later, numbered per
--                 customer in date order.
--   final query : pairs the n-th start with the n-th end of each customer and
--                 sums the inclusive day count of every pair.
-- The table is referenced as dbo.Items everywhere so the name also resolves
-- under a case-sensitive collation.
WITH
  start_dates AS
  ( SELECT i.CustID,
           i.StartDate,
           ROW_NUMBER() OVER (PARTITION BY i.CustID
                              ORDER BY i.StartDate) AS Rn
    FROM dbo.Items AS i
    WHERE NOT EXISTS
          ( SELECT *
            FROM dbo.Items AS j
            WHERE j.CustID = i.CustID
              AND j.StartDate < i.StartDate
              AND i.StartDate <= j.EndDate
          )
    GROUP BY i.CustID, i.StartDate
  ),
  end_dates AS
  ( SELECT i.CustID,
           i.EndDate,
           ROW_NUMBER() OVER (PARTITION BY i.CustID
                              ORDER BY i.EndDate) AS Rn
    FROM dbo.Items AS i
    WHERE NOT EXISTS
          ( SELECT *
            FROM dbo.Items AS j
            WHERE j.CustID = i.CustID
              AND j.StartDate <= i.EndDate
              AND i.EndDate < j.EndDate
          )
    GROUP BY i.CustID, i.EndDate
  )
SELECT s.CustID,
       SUM( DATEDIFF(day, s.StartDate, e.EndDate) + 1 ) AS Result
FROM start_dates AS s
INNER JOIN end_dates AS e
    ON  s.CustID = e.CustID
    AND s.Rn = e.Rn
GROUP BY s.CustID ;